[elinks-dev] [PATCH] ascii replacements in utf-8 mode on a linux console

sgerwk Thu, 11 May 2017 10:42:54 -0700

In non-utf-8 mode, if a character is not in the current codepage, it may
be replaced by an ASCII string. For example, "ϐ" is shown as "beta". The
set of substitutions is in Unicode/7bitrepl.lnx.


This patch does the same in a linux console in utf-8 mode. It obtains the
unicode mapping for the current font, and if a character to be displayed is
not in it but has a replacement, it shows the replacement instead.

I believe the concept is useful, and the implementation kind of works.
However, a number of things are not done exactly as they should be:

- getting the unicode mapping should go in osdep.c instead of charsets.c
- the search in "codepoints" is done twice: once to get the size of the
  replacement and once more for the actual replacement
- the inclusion of sys/kd.h should be conditional
- allocating charslen*20 bytes for a line may be overkill, and 20 is a
  magic number (should be the maximal size of an ascii replacement)

diff -Naur elinks-0.13-20170507/src/document/html/renderer.c 
elinks-0.13-modified/src/document/html/renderer.c
--- elinks-0.13-20170507/src/document/html/renderer.c   2017-03-06 
21:47:40.000000000 +0100
+++ elinks-0.13-modified/src/document/html/renderer.c   2017-05-11 
16:18:08.000000000 +0200
@@ -527,8 +527,10 @@
                 * incomplete character in document->buf, then
                 * the first byte of input can result in a double-cell
                 * character, so we must reserve one extra element.  */
+               /* But ASCII replacements complicate this, because a single
+                  codepoint may be rendered as up to 20 chars */
                orig_length = realloc_line(html_context, document,
-                                          Y(y), X(x) + charslen);
+                                          Y(y), X(x) + charslen * 20);
                if (orig_length < 0) /* error */
                        return 0;
                if (utf8) {
@@ -640,7 +642,16 @@
 #endif /* CONFIG_COMBINE */
                                part->spaces[x] = (data == UCS_SPACE);

-                               if (unicode_to_cell(data) == 2) {
+                               if (codepoint_replacement(data) != -1) {
+                                       int i;
+
+                                       for(i = 0; i < unicode_to_cell(data); 
i++) {
+                                               schar->data = 
encode_utf8(data)[i];
+                                               part->char_width[x] = 1;
+                                               copy_screen_chars(&POS(x++, y), 
schar, 1);
+                                       }
+                                       continue;
+                               } else if (unicode_to_cell(data) == 2) {
                                        schar->data = (unicode_val_T)data;
                                        part->char_width[x] = 2;
                                        copy_screen_chars(&POS(x++, y), schar, 
1);
diff -Naur elinks-0.13-20170507/src/intl/charsets.c 
elinks-0.13-modified/src/intl/charsets.c
--- elinks-0.13-20170507/src/intl/charsets.c    2017-03-06 21:47:40.000000000 
+0100
+++ elinks-0.13-modified/src/intl/charsets.c    2017-05-11 16:17:56.000000000 
+0200
@@ -23,6 +23,9 @@
 #include <iconv.h>
 #endif

+#include <sys/ioctl.h>
+#include <sys/kd.h>
+
 #include "elinks.h"

 #include "document/options.h"
@@ -33,6 +36,7 @@
 #include "util/hash.h"
 #include "util/memory.h"
 #include "util/string.h"
+#include "osdep/osdep.h"


 /* Fix namespace clash on MacOS. */
@@ -185,6 +189,46 @@
        }                                                                       
        \
 }                                                                              
        \

+/* list of unicode codepoints supported by the current terminal, if this
+ * information is available, otherwise size = -1 */
+
+struct {
+       int size;
+       unicode_val_T *list;
+} codepoints;
+
+int is_codepoint_supported(unicode_val_T u) {
+       int first, last, middle;
+
+       if (codepoints.size == -1)
+               return 1;
+
+       first = 0;
+       last = codepoints.size - 1;
+
+       while (first <= last) {
+               middle = (last + first) / 2;
+               if (codepoints.list[middle] == u)
+                       return u;
+               else if (codepoints.list[middle] > u)
+                       last = middle - 1;
+               else
+                       first = middle + 1;
+       }
+
+       return 0;
+}
+
+int codepoint_replacement(unicode_val_T u) {
+       int s;
+
+       if (is_codepoint_supported(u))
+               return -1;
+
+       BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
+       return s;
+}
+
 static const unicode_val_T strange_chars[32] = {
 0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
 0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
@@ -240,8 +284,15 @@
 NONSTATIC_INLINE unsigned char *
 encode_utf8(unicode_val_T u)
 {
+       int s;
+
        memset(utf_buffer, 0, 7);

+       if (!is_codepoint_supported(u)) {
+               BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
+               if (s != -1) return unicode_7b[s].s;
+       }
+
        if (u < 0x80)
                utf_buffer[0] = u;
        else if (u < 0x800)
@@ -611,6 +662,13 @@
 NONSTATIC_INLINE int
 unicode_to_cell(unicode_val_T c)
 {
+       int s;
+
+       if (!is_codepoint_supported(c)) {
+               BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, c, s);
+               if (s != -1) return strlen(unicode_7b[s].s);
+       }
+
        if (c >= 0x1100
                && (c <= 0x115f                      /* Hangul Jamo */
                || c == 0x2329
@@ -861,8 +919,8 @@
                else {
                        struct conv_table *nct;

-                       assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
-                       if_assert_failed return;
+                       // assertm(ct[*p].u.str == no_str, "bad utf encoding 
#1");
+                       // if_assert_failed return;

                        nct = mem_calloc(256, sizeof(*nct));
                        if (!nct) return;
@@ -874,8 +932,8 @@
                p++;
        }

-       assertm(!ct[*p].t, "bad utf encoding #2");
-       if_assert_failed return;
+       // assertm(!ct[*p].t, "bad utf encoding #2");
+       // if_assert_failed return;

        if (ct[*p].u.str == no_str)
                ct[*p].u.str = str;
@@ -1597,9 +1655,71 @@

 #endif /* USE_FASTFIND */

+/* create the list of codepoints supported by the terminal */
+
+#ifdef GIO_UNIMAP
+int cmpint(const void *a, const void *b) {
+       if (* (int *) a < * (int *) b)
+               return -1;
+       else if (* (int *) a == * (int *) b)
+               return 0;
+       else
+               return 1;
+}
+
+void make_codepoints() {
+       int tty;
+       struct unimapdesc table;
+       int res;
+       int i;
+
+       tty = get_ctl_handle();
+       if (tty == -1) {
+               codepoints.size = -1;
+               return ;
+       }
+
+       table.entry_ct = -1;
+       table.entries = NULL;
+       res = ioctl(tty, GIO_UNIMAP, &table);
+       if (res) {
+               perror("GIO_UNIMAP");
+               close(tty);
+               codepoints.size = -1;
+               return;
+       }
+
+       table.entries = malloc(table.entry_ct * sizeof(struct unipair));
+       res = ioctl(tty, GIO_UNIMAP, &table);
+       if (res) {
+               perror("GIO_UNIMAP");
+               close(tty);
+               codepoints.size = -1;
+               return;
+       }
+
+       close(tty);
+
+       codepoints.size = table.entry_ct;
+       codepoints.list = malloc(table.entry_ct * sizeof(unicode_val_T));
+       for (i = 0; i < table.entry_ct; i++)
+               codepoints.list[i] = table.entries[i].unicode;
+
+       qsort(codepoints.list, codepoints.size, sizeof(unicode_val_T), cmpint);
+
+       // for (i = 0; i < codepoints.size; i++)
+       //      fprintf(stderr, "U+%04X\n", codepoints.list[i]);
+}
+#else
+void make_codepoints() {
+       codepoints.size = -1;
+}
+#endif
+
 void
 init_charsets_lookup(void)
 {
+       make_codepoints();
 #ifdef USE_FASTFIND
        fastfind_index(&ff_charsets_index, FF_COMPRESS);
 #endif
diff -Naur elinks-0.13-20170507/src/intl/charsets.h 
elinks-0.13-modified/src/intl/charsets.h
--- elinks-0.13-20170507/src/intl/charsets.h    2017-03-06 21:47:40.000000000 
+0100
+++ elinks-0.13-modified/src/intl/charsets.h    2017-05-11 16:17:56.000000000 
+0200
@@ -156,6 +156,8 @@
                                 int, enum utf8_step, int *);
 unsigned char *utf8_step_backward(unsigned char *, unsigned char *,
                                  int, enum utf8_step, int *);
+int is_codepoint_supported(unicode_val_T u);
+int codepoint_replacement(unicode_val_T u);
 int unicode_to_cell(unicode_val_T);
 unicode_val_T unicode_fold_label_case(unicode_val_T);
 int strlen_utf8(unsigned char **);
--
http://lists.linuxfromscratch.org/listinfo/elinks-dev
Unsubscribe: See the above information page

[elinks-dev] [PATCH] ascii replacements in utf-8 mode on a linux console

Reply via email to