In non-utf-8 mode, if a character is not in the current codepage, it may
be replaced by an ascii string. For example, "ϐ" is shown as
"beta". The set of substitutions is in Unicode/7bitrepl.lnx.
This patch does the same in a linux console in utf-8 mode. It obtains the
unicode mapping for the current font, and if a character to be displayed is
not in it but has a replacement, it shows the replacement instead.
I believe the concept is useful, and the implementation kind of works.
However, a number of things are not done exactly as they should be:
- getting the unicode mapping should go in osdep.c instead of charsets.c
- the search in "codepoints" is done twice: once to get the size of the
replacement and once more for the actual replacement
- the inclusion of sys/kd.h should be conditional
- allocating charslen*20 bytes for a line may be overkill, and 20 is a
magic number (it should be the maximal size of an ascii replacement)
diff -Naur elinks-0.13-20170507/src/document/html/renderer.c
elinks-0.13-modified/src/document/html/renderer.c
--- elinks-0.13-20170507/src/document/html/renderer.c 2017-03-06
21:47:40.000000000 +0100
+++ elinks-0.13-modified/src/document/html/renderer.c 2017-05-11
16:18:08.000000000 +0200
@@ -527,8 +527,10 @@
* incomplete character in document->buf, then
* the first byte of input can result in a double-cell
* character, so we must reserve one extra element. */
+ /* But ASCII replacements complicate this, because a single
+ codepoint may be rendered as up to 20 chars. */
orig_length = realloc_line(html_context, document,
- Y(y), X(x) + charslen);
+ Y(y), X(x) + charslen * 20);
if (orig_length < 0) /* error */
return 0;
if (utf8) {
@@ -640,7 +642,16 @@
#endif /* CONFIG_COMBINE */
part->spaces[x] = (data == UCS_SPACE);
- if (unicode_to_cell(data) == 2) {
+ if (codepoint_replacement(data) != -1) {
+ int i;
+
+ for(i = 0; i < unicode_to_cell(data);
i++) {
+ schar->data =
encode_utf8(data)[i];
+ part->char_width[x] = 1;
+ copy_screen_chars(&POS(x++, y),
schar, 1);
+ }
+ continue;
+ } else if (unicode_to_cell(data) == 2) {
schar->data = (unicode_val_T)data;
part->char_width[x] = 2;
copy_screen_chars(&POS(x++, y), schar,
1);
diff -Naur elinks-0.13-20170507/src/intl/charsets.c
elinks-0.13-modified/src/intl/charsets.c
--- elinks-0.13-20170507/src/intl/charsets.c 2017-03-06 21:47:40.000000000
+0100
+++ elinks-0.13-modified/src/intl/charsets.c 2017-05-11 16:17:56.000000000
+0200
@@ -23,6 +23,9 @@
#include <iconv.h>
#endif
+#include <sys/ioctl.h>
+#include <sys/kd.h>
+
#include "elinks.h"
#include "document/options.h"
@@ -33,6 +36,7 @@
#include "util/hash.h"
#include "util/memory.h"
#include "util/string.h"
+#include "osdep/osdep.h"
/* Fix namespace clash on MacOS. */
@@ -185,6 +189,46 @@
}
\
}
\
+/* list of unicode codepoints supported by the current terminal, if this
+ * information is available, otherwise size = -1 */
+
+struct {
+ int size; /* number of entries in list, or -1 when the list is unavailable */
+ unicode_val_T *list; /* filled and sorted by make_codepoints() */
+} codepoints;
+
+/* Tell whether the terminal can display u: 1 if u is in the sorted
+ * codepoints.list or the list is unavailable (size == -1), else 0. */
+int is_codepoint_supported(unicode_val_T u) {
+	int first = 0, last = codepoints.size - 1;
+
+	if (codepoints.size == -1)
+		return 1;
+
+	while (first <= last) {
+		int middle = first + (last - first) / 2;
+		/* Plain truth value: "return u" made a found U+0000 look unsupported. */
+		if (codepoints.list[middle] == u)
+			return 1;
+		if (codepoints.list[middle] > u)
+			last = middle - 1;
+		else
+			first = middle + 1;
+	}
+
+	return 0;
+}
+
+/* Index in unicode_7b of the replacement to show instead of u, or -1 when
+ * the terminal supports u (callers also treat -1 as "none found"). */
+int codepoint_replacement(unicode_val_T u) {
+	int idx = -1;
+	if (!is_codepoint_supported(u)) {
+		BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, idx);
+	}
+	return idx;
+}
+
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
@@ -240,8 +284,15 @@
NONSTATIC_INLINE unsigned char *
encode_utf8(unicode_val_T u)
{
+ int s;
+
memset(utf_buffer, 0, 7);
+ if (!is_codepoint_supported(u)) {
+ BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
+ if (s != -1) return unicode_7b[s].s;
+ }
+
if (u < 0x80)
utf_buffer[0] = u;
else if (u < 0x800)
@@ -611,6 +662,13 @@
NONSTATIC_INLINE int
unicode_to_cell(unicode_val_T c)
{
+ int s;
+
+ if (!is_codepoint_supported(c)) {
+ BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, c, s);
+ if (s != -1) return strlen(unicode_7b[s].s);
+ }
+
if (c >= 0x1100
&& (c <= 0x115f /* Hangul Jamo */
|| c == 0x2329
@@ -861,8 +919,8 @@
else {
struct conv_table *nct;
- assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
- if_assert_failed return;
+ // assertm(ct[*p].u.str == no_str, "bad utf encoding
#1");
+ // if_assert_failed return;
nct = mem_calloc(256, sizeof(*nct));
if (!nct) return;
@@ -874,8 +932,8 @@
p++;
}
- assertm(!ct[*p].t, "bad utf encoding #2");
- if_assert_failed return;
+ // assertm(!ct[*p].t, "bad utf encoding #2");
+ // if_assert_failed return;
if (ct[*p].u.str == no_str)
ct[*p].u.str = str;
@@ -1597,9 +1655,71 @@
#endif /* USE_FASTFIND */
+/* create the list of codepoints supported by the terminal */
+
+#ifdef GIO_UNIMAP
+/* qsort() comparator: orders two unicode_val_T values ascending. */
+int cmpint(const void *a, const void *b) {
+	/* Compare as unicode_val_T, the element type make_codepoints() sorts;
+	 * the old (int *) casts dropped const and assumed int-sized items. */
+	const unicode_val_T x = *(const unicode_val_T *) a;
+	const unicode_val_T y = *(const unicode_val_T *) b;
+	return (x > y) - (x < y);
+}
+
+/* Fill "codepoints" with the sorted list of codepoints that the console
+ * font maps, as reported by the GIO_UNIMAP ioctl. On any failure size is
+ * left at -1, which is_codepoint_supported() treats as "all supported". */
+void make_codepoints(void) {
+	struct unimapdesc table;
+	int tty = get_ctl_handle();
+	int i;
+
+	codepoints.size = -1;
+	if (tty == -1)
+		return;
+
+	/* First call, without a buffer: only entry_ct is used afterwards. */
+	table.entry_ct = -1;
+	table.entries = NULL;
+	if (ioctl(tty, GIO_UNIMAP, &table)) {
+		perror("GIO_UNIMAP");
+		goto out;
+	}
+
+	/* Second call fetches the pairs; allocations are now checked. */
+	table.entries = malloc(table.entry_ct * sizeof(struct unipair));
+	if (!table.entries)
+		goto out;
+	if (ioctl(tty, GIO_UNIMAP, &table)) {
+		perror("GIO_UNIMAP");
+		goto out;
+	}
+
+	codepoints.list = malloc(table.entry_ct * sizeof(unicode_val_T));
+	if (!codepoints.list)
+		goto out;
+	for (i = 0; i < table.entry_ct; i++)
+		codepoints.list[i] = table.entries[i].unicode;
+	codepoints.size = table.entry_ct;
+	qsort(codepoints.list, codepoints.size, sizeof(unicode_val_T), cmpint);
+
+out:
+	/* NOTE(review): confirm the handle from get_ctl_handle() is ours to
+	 * close; the original closed it on every path after the first ioctl. */
+	free(table.entries); /* was leaked before; free(NULL) is a no-op */
+	close(tty);
+}
+#else
+void make_codepoints() {
+ codepoints.size = -1; /* GIO_UNIMAP not defined: mark the supported set as unavailable */
+}
+#endif
+
void
init_charsets_lookup(void)
{
+ make_codepoints();
#ifdef USE_FASTFIND
fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
diff -Naur elinks-0.13-20170507/src/intl/charsets.h
elinks-0.13-modified/src/intl/charsets.h
--- elinks-0.13-20170507/src/intl/charsets.h 2017-03-06 21:47:40.000000000
+0100
+++ elinks-0.13-modified/src/intl/charsets.h 2017-05-11 16:17:56.000000000
+0200
@@ -156,6 +156,8 @@
int, enum utf8_step, int *);
unsigned char *utf8_step_backward(unsigned char *, unsigned char *,
int, enum utf8_step, int *);
+int is_codepoint_supported(unicode_val_T u);
+int codepoint_replacement(unicode_val_T u);
int unicode_to_cell(unicode_val_T);
unicode_val_T unicode_fold_label_case(unicode_val_T);
int strlen_utf8(unsigned char **);
--
http://lists.linuxfromscratch.org/listinfo/elinks-dev
Unsubscribe: See the above information page