The attached patch implements some support for converting a UTF-8 encoded
Info file for display on an ASCII terminal. It is against SVN revision 5400.
set_file_lc_ctype() checks whether the file contains a Local Variables
section with a "coding:" line. The convert_characters() function then
substitutes characters in the file if the document encoding is different
from the encoding of the environment. At the moment only three UTF-8
characters are handled.
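
For reference, the trailer the patch searches for is what makeinfo writes at
the end of its output when @documentencoding is given; as far as I remember
it looks something like this (the encoding name is matched case-insensitively):

  Local Variables:
  coding: utf-8
  End: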

I used the strcasecmp and nl_langinfo functions; they are in POSIX but not
in ISO C, so I'm not sure how portable they are.
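
As a minimal standalone check of what the two calls return, assuming a POSIX
system (this program is only an illustration, not part of the patch):

#include <langinfo.h>   /* nl_langinfo, CODESET (POSIX) */
#include <locale.h>     /* setlocale */
#include <stdio.h>
#include <strings.h>    /* strcasecmp (POSIX) */

int
main (void)
{
  const char *codeset;

  /* Pick up LC_CTYPE etc. from the environment; otherwise nl_langinfo
     reports the "C" locale's codeset.  The patch itself relies on info
     having already called setlocale at startup, which as far as I can
     see it does for gettext. */
  setlocale (LC_ALL, "");

  /* Name of the character encoding of the current locale,
     e.g. "UTF-8" or "ISO-8859-1". */
  codeset = nl_langinfo (CODESET);
  printf ("terminal codeset: %s\n", codeset);

  /* Case-insensitive comparison, as used to match the document
     encoding against the terminal's. */
  if (strcasecmp (codeset, "UTF-8") == 0)
    printf ("running in a UTF-8 locale\n");

  return 0;
}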

I've attached a file I used to test this patch; it gives the expected
difference in behaviour between UTF-8 and Latin-1 terminals for me.
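
For example, the difference shows up on one machine by running the patched
info under two locales (assuming both are installed; the locale names here
are only examples):

  LC_ALL=en_US.UTF-8 info -f ./utf8.info
  LC_ALL=en_US.ISO-8859-1 info -f ./utf8.info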

On Wed, Jan 1, 2014 at 12:15 AM, Karl Berry <[email protected]> wrote:
> In my experience, the problem is not specific to Info and not specific
> to quotes.  If I run cat or more or ... on a UTF-8 file in a non-UTF-8
> terminal, characters are dropped and the result beyond 7-bit ASCII is
> garbled.
>
> This has always seemed like a fundamental problem in UTF-8 usage to me,
> one that would be better addressed at the terminal level, so at least
> one can always see the bytes, if not the "best possible"
> transliteration, without every single program that writes to stdout
> having to implement the same thing.  But since nothing like that is
> going to happen, I suppose Info should somehow deal with it, just like
> every other program in the world.  Sigh.  Patches are welcome.
>
> As for controlling the output of quotes by makeinfo, an option could be
> invented, but I am not inclined to change the default behavior so I'm
> not convinced it has much utility.  We changed it in the first place
> because of vociferous complaints about getting ASCII quotes even with
> @documentencoding UTF-8.  And after all, there is some logic to using
> UTF-8 quotes when the document says it wants UTF-8.  It's no different
> in principle than accented letters.
>
> At any rate, the best answer, IMHO, not requiring any changes to any
> programs, is simply not to use @documentencoding UTF-8 unless one
> actually needs it, which should be never in English-language manuals.
> 7-bit ASCII source with Texinfo @-commands is preferable.  These days
> many people reflexively think that UTF-8 is wonderful, always use it,
> and want to inflict it on everyone else too, but that is simply wrong.
>
> karl
>
diff -x 'Makefile*' -x '*.o' -x '*~' texinfo/trunk/info/nodes.c info-locale/info/nodes.c
29a30,31
> #include <nl_types.h>
> #include <langinfo.h>
44a47,48
> static void set_file_lc_ctype (FILE_BUFFER *fb);
> static void convert_characters (FILE_BUFFER *fb);
328a333,485
> char *locale_names[] = { "US-ASCII", "UTF-8", "ISO-8859-1", "ISO-8859-2",
>                        "ISO-8859-15", 0 };
> 
> /* Look for local variables section in FB and set encoding */
> static void
> set_file_lc_ctype (FILE_BUFFER *fb)
> {
>   SEARCH_BINDING binding;
>   long position;
> 
>   long int enc_start, enc_end;
>   char *enc_string;
> 
>   char **locale_name;
> 
>   /* See if there is a local variables section in this info file. */
>   binding.buffer = fb->contents;
>   binding.start = fb->filesize;
>   binding.end = binding.start - 1000;
>   if (binding.end < 0)
>     binding.end = 0;
>   binding.flags = S_FoldCase;
> 
>   fb->lc_ctype = ENC_UNKNOWN;
> 
>   if (search_backward (LOCAL_VARIABLES_LABEL, &binding, &position)
>       != search_success)
>     return;
> 
>   binding.start = position;
>   binding.end = fb->filesize;
> 
>   if (search_forward ("coding:", &binding, &enc_start)
>       != search_success)
>     return;
> 
>   enc_start += 7; /* Skip to after "coding:" */
>   enc_start += skip_whitespace(fb->contents + enc_start);
>   binding.start = enc_start;
> 
>   search_forward ("\n", &binding, &enc_end);
> 
>   enc_string = xmalloc (enc_end - enc_start + 1);
>   strncpy (enc_string, fb->contents + enc_start, enc_end - enc_start);
>   enc_string[enc_end - enc_start] = '\0';
> 
>   for (locale_name = locale_names; *locale_name != 0; locale_name++)
>     if (!strcasecmp (enc_string, *locale_name))
>       fb->lc_ctype = locale_name - locale_names;
> 
>   free (enc_string);
> }
> 
> /* The degrade functions replace one character of their encoding at *C with
>  * an ASCII equivalent and return the number of bytes by which the text
>  * shrank.  *C is left pointing at the last byte written.  At present a
>  * replacement longer than the original sequence is not possible. */
> 
> static int
> degrade_dummy (char **c) { return 0; }
> 
> struct encoding_replacement
> {
>   char *from_string;
>   char *to_string;
> };
> 
> static int
> degrade_utf8 (char **c)
> {
>   struct encoding_replacement er[] = {
>   {"\xc3\xb6","o"}, /* lower-case o with umlaut */
>   {"\xe2\x80\x98","'"}, /* Opening quote */
>   {"\xe2\x80\x99","'"}, /* Closing quote */
>   {0, 0}};
> 
>   struct encoding_replacement *erp;
> 
>   for (erp = er; erp->from_string != 0; erp++)
>     {
>       if (!strncmp (*c, erp->from_string, strlen (erp->from_string)))
>         {
>           strncpy(*c, erp->to_string, strlen(erp->to_string));
>           *c += strlen (erp->to_string) - 1;
>           return strlen (erp->from_string) - strlen (erp->to_string);
>         }
>     }
>   return 0;
> }
> 
> /* Convert characters in the nodes for FB to the current locale */
> static void
> convert_characters (FILE_BUFFER *fb)
> {
>   char *c;
>   long node = 0, nextnode;
>   SEARCH_BINDING binding;
>   char *to_locale;
> 
>   int (*degrade_funcs[5])(char **) = {
>     degrade_dummy, degrade_utf8, degrade_dummy,
>     degrade_dummy, degrade_dummy };
> 
>   int (*degrade)(char **);
> 
>   if (fb->lc_ctype == ENC_UNKNOWN) return;
> 
>   /* Read environment locale */
> 
>   to_locale = nl_langinfo(CODESET);
> 
>   /* Don't degrade the contents if we are in fact
>    * in the right locale for the file */
>   if (!strcasecmp(to_locale, locale_names[fb->lc_ctype]))
>     return;
> 
>   degrade = degrade_funcs [fb->lc_ctype];
> 
>   /* Return if no conversion function implemented */
>   if (degrade == degrade_dummy) return;
> 
>   binding.buffer = fb->contents;
>   binding.start = 0;
>   binding.end = fb->filesize;
> 
>   /* Loop between node_separators. The second part of the condition
>    * makes us operate on the last node, which does not end with a
>    * node separator (although it will be a tags table or local variables
>    * section anyway). */
>   while ((nextnode = find_node_separator (&binding)) != -1
>     || (node != fb->filesize && (nextnode = fb->filesize)))
>     {
>       binding.start = nextnode + 1;
> 
>       /* Convert characters from node to nextnode */
> 
>       for (c = binding.buffer + node; c < binding.buffer + nextnode; c++)
>         {
>           int shrink_by;
>           shrink_by = degrade (&c);
>           if (shrink_by != 0)
>             {
>               /* Shift rest of file backwards by shrink_by bytes. */
>               memmove (c + 1, c + 1 + shrink_by,
>                       (fb->contents + fb->filesize) - (c + 1 + shrink_by));
>               fb->filesize -= shrink_by;
>               /* Keep the offsets beyond this point in step with the shift. */
>               binding.end -= shrink_by;
>               binding.start -= shrink_by;
>               nextnode -= shrink_by;
>             }
>         }
> 
>       node = nextnode;
>       node += skip_whitespace (binding.buffer + node);
>     }
> }
> 
399a557,559
> 
>   /* Find encoding of file, if set */
>   set_file_lc_ctype(file_buffer);
400a561,564
>   /* Convert characters in file buffer to current locale as much
>    * as possible. */
>   convert_characters (file_buffer);
> 
diff -x 'Makefile*' -x '*.o' -x '*~' texinfo/trunk/info/nodes.h info-locale/info/nodes.h
74a75
> #define LOCAL_VARIABLES_LABEL		"Local Variables"
114a116
>   int lc_ctype;			/* Encoding - index into locale_names */
115a118,124
> 
> /* Null-terminated array of strings naming the encodings recognized in an
>    Info file's Local Variables section.  FILE_BUFFER.lc_ctype is an index
>    into this array. */
> extern char *locale_names[];
> 
> /* Value of FILE_BUFFER.lc_ctype if encoding is unknown */
> #define ENC_UNKNOWN     -1
> 
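
Not part of the patch, but to illustrate the substitution idea on its own:
a throwaway program using the same kind of replacement table as
degrade_utf8(), shrinking a buffer in place (the names and table entries
below are only an example):

#include <stdio.h>
#include <string.h>

struct replacement { const char *from, *to; };

/* The same three UTF-8 sequences the patch handles. */
static const struct replacement table[] = {
  { "\xc3\xb6", "o" },        /* lower-case o with umlaut */
  { "\xe2\x80\x98", "'" },    /* opening single quote */
  { "\xe2\x80\x99", "'" },    /* closing single quote */
  { 0, 0 }
};

/* Replace known multi-byte sequences in BUF with their ASCII equivalents,
   shifting the tail of the buffer left.  Returns the new length. */
static size_t
degrade_buffer (char *buf, size_t len)
{
  size_t i = 0;

  while (i < len)
    {
      const struct replacement *r;
      for (r = table; r->from != 0; r++)
        {
          size_t flen = strlen (r->from), tlen = strlen (r->to);
          if (flen <= len - i && !memcmp (buf + i, r->from, flen))
            {
              memcpy (buf + i, r->to, tlen);
              memmove (buf + i + tlen, buf + i + flen, len - i - flen);
              len -= flen - tlen;
              i += tlen - 1;
              break;
            }
        }
      i++;
    }
  return len;
}

int
main (void)
{
  char buf[] = "\xe2\x80\x98sch\xc3\xb6n\xe2\x80\x99\n";
  size_t n = degrade_buffer (buf, strlen (buf));
  fwrite (buf, 1, n, stdout);   /* prints 'schon' */
  return 0;
}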

Attachment: utf8.info
Description: Binary data
