[Lynx-dev] patch to convert JIS X 0201 Katakana from Shift_JIS/EUC-JP to UTF-8

KIHARA Hideto Sat, 27 Oct 2018 00:28:35 -0700

Japanese JIS X 0201 Katakana characters in Shift_JIS/EUC-JP encoded text
are not converted to UTF-8 correctly when display_charset is UTF-8.


The attached patch fixes JIS X 0201 Katakana conversion
from Shift_JIS/EUC-JP to UTF-8.

Screen captures of the expected and actual behavior:
http://www1.interq.or.jp/~deton/lynx-sjisjisx0201/

-- 
KIHARA, Hideto / de...@m1.interq.or.jp

diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c
index 193e68d..1f20ee8 100644
--- a/WWW/Library/Implementation/SGML.c
+++ b/WWW/Library/Implementation/SGML.c
@@ -1670,6 +1670,13 @@ static void SGML_character(HTStream *me, int c_in)
 		    me->U.utf_buf[0] = (char) c;
 		    me->U.utf_count = 1;
 		    clong = -11;
+		} else if (IS_SJIS_X0201KANA(c)) {
+		    if (conv_jisx0201kana) {
+			JISx0201TO0208_SJIS(c, me->U.utf_buf, me->U.utf_buf + 1);
+			clong = UCTransJPToUni(me->U.utf_buf, 2, me->inUCLYhndl);
+		    } else {
+			clong = UCTransToUni(c, me->inUCLYhndl);
+		    }
 		}
 	    } else {
 		if (IS_SJIS_LO((unsigned char) c)) {
@@ -1680,7 +1687,7 @@ static void SGML_character(HTStream *me, int c_in)
 	    }
 	} else {
 	    if (me->U.utf_count == 0) {
-		if (IS_EUC_HI((unsigned char) c)) {
+		if (IS_EUC_HI((unsigned char) c) || c == 0x8E) {
 		    me->U.utf_buf[0] = (char) c;
 		    me->U.utf_count = 1;
 		    clong = -11;
diff --git a/src/UCdomap.c b/src/UCdomap.c
index 7c2ef7c..673b6ac 100644
--- a/src/UCdomap.c
+++ b/src/UCdomap.c
@@ -1217,6 +1217,16 @@ UCode_t UCTransToUni(int ch_in,
 		    buffer[0] = (char) ch_in;
 		    inx = 1;
 		    return ucNeedMore;
+		} else if (IS_SJIS_X0201KANA(ch_iu)) {
+		    buffer[0] = (char) ch_in;
+		    buffer[1] = 0;
+		    cd = iconv_open("UTF-16BE", "Shift_JIS");
+		    ilen = 1;
+		    (void) iconv(cd, (ICONV_CONST char **) &pin, &ilen, &pout, &olen);
+		    iconv_close(cd);
+		    if ((ilen == 0) && (olen == 0)) {
+			return (UCH(obuffer[0]) << 8) + UCH(obuffer[1]);
+		    }
 		}
 	    } else {
 		if (IS_SJIS_LO(ch_iu)) {
@@ -1235,7 +1245,7 @@ UCode_t UCTransToUni(int ch_in,
 	}
 	if (strcmp(LYCharSet_UC[charset_in].MIMEname, "euc-jp") == 0) {
 	    if (inx == 0) {
-		if (IS_EUC_HI(ch_iu)) {
+		if (IS_EUC_HI(ch_iu) || ch_iu == 0x8E) {
 		    buffer[0] = (char) ch_in;
 		    inx = 1;
 		    return ucNeedMore;

_______________________________________________
Lynx-dev mailing list
Lynx-dev@nongnu.org
https://lists.nongnu.org/mailman/listinfo/lynx-dev

[Lynx-dev] patch to convert JIS X 0201 Katakana from Shift_JIS/EUC-JP to UTF-8

Reply via email to