The attached patch adds conversion from Japanese ISO-2022-JP HTML
to a UTF-8 display_charset.

Screen captures:
http://www1.interq.or.jp/~deton/lynx-jis2utf8/

-- 
KIHARA, Hideto / de...@m1.interq.or.jp
diff --git a/WWW/Library/Implementation/SGML.c b/WWW/Library/Implementation/SGML.c
index 1f20ee8..5fbea95 100644
--- a/WWW/Library/Implementation/SGML.c
+++ b/WWW/Library/Implementation/SGML.c
@@ -93,6 +93,11 @@ static void fake_put_character(HTStream *p GCC_UNUSED,
 /*the following macros are used for pretty source view. */
 #define IS_C(attr) (attr.type == HTMLA_CLASS)
 
+#if defined(ISO2022JP_TOUTF8) && defined(EXP_JAPANESEUTF8_SUPPORT)
+# define UTF8_TTY_ISO2022JP (me->T.output_utf8)
+#else
+# define UTF8_TTY_ISO2022JP 0
+#endif
 HTCJKlang HTCJK = NOCJK;	/* CJK enum value.              */
 BOOL HTPassEightBitRaw = FALSE;	/* Pass 161-172,174-255 raw.    */
 BOOL HTPassEightBitNum = FALSE;	/* Pass ^ numeric entities raw. */
@@ -1659,6 +1664,34 @@ static void SGML_character(HTStream *me, int c_in)
     /*
      * If we want the raw input converted to Unicode, try that now.  - FM
      */
+    /* Convert ISO-2022-JP to Unicode (charset=iso-2022-jp is unrecognized) */
+#define IS_JIS7_HILO(c) (0x20<(c)&&(c)<0x7F)
+    if (UTF8_TTY_ISO2022JP && (me->state == S_nonascii_text
+	    || me->state == S_nonascii_text_sq
+	    || me->state == S_nonascii_text_dq)) {
+	/* end of ISO-2022-JP? || not in ISO-2022-JP range */
+	if (TOASCII(c) == '\033' || !IS_JIS7_HILO(c)) {
+	    me->kanji_buf = '\0';
+	    goto top1;
+        }
+	if (me->kanji_buf == '\t') { /* flag for single byte kana in "ESC(I" */
+	    if (conv_jisx0201kana) {
+		JISx0201TO0208_SJIS(c | 0200, me->U.utf_buf, me->U.utf_buf + 1);
+		clong = UCTransJPToUni(me->U.utf_buf, 2, UCGetLYhndl_byMIME("shift_jis"));
+	    } else {
+		clong = UCTransToUni(c | 0200, UCGetLYhndl_byMIME("shift_jis"));
+	    }
+	} else if (me->kanji_buf) {
+	    me->U.utf_buf[0] = me->kanji_buf | 0200; /* to EUC-JP */
+	    me->U.utf_buf[1] = c | 0200;
+	    clong = UCTransJPToUni(me->U.utf_buf, 2, UCGetLYhndl_byMIME("euc-jp"));
+	    me->kanji_buf = '\0';
+	} else {
+	    me->kanji_buf = c;
+	    clong = ucNeedMore;
+	}
+	goto top1;
+    }
     if (me->T.trans_to_uni &&
 #ifdef EXP_JAPANESEUTF8_SUPPORT
 	((strcmp(LYCharSet_UC[me->inUCLYhndl].MIMEname, "euc-jp") == 0) ||
@@ -1808,7 +1841,8 @@ static void SGML_character(HTStream *me, int c_in)
      */
     if (TOASCII(clong) < 32 &&
 	c != '\t' && c != '\n' && c != '\r' &&
-	!IS_CJK_TTY)
+	!IS_CJK_TTY &&
+	!(UTF8_TTY_ISO2022JP && TOASCII(c) == '\033'))
 	goto after_switch;
 
     /*
@@ -1916,13 +1950,14 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_in_kanji;
 	    me->kanji_buf = c;
 	    break;
-	} else if (IS_CJK_TTY && TOASCII(c) == '\033') {	/* S/390 -- gil -- 0881 */
+	} else if ((IS_CJK_TTY || UTF8_TTY_ISO2022JP) && TOASCII(c) == '\033') {	/* S/390 -- gil -- 0881 */
 	    /*
 	     * Setting up for CJK escape sequence handling (based on Takuya
 	     * ASADA's (as...@three-a.co.jp) CJK Lynx).  - FM
 	     */
 	    me->state = S_esc;
-	    PUTC(c);
+	    if (!UTF8_TTY_ISO2022JP)
+		PUTC(c);
 	    break;
 	}
 
@@ -3649,7 +3684,8 @@ static void SGML_character(HTStream *me, int c_in)
 	     * - Takuya ASADA (as...@three-a.co.jp)
 	     */
 	    me->state = S_esc_sq;
-	    HTChunkPutc(string, c);
+	    if (!UTF8_TTY_ISO2022JP)
+		HTChunkPutc(string, c);
 	} else if (me->T.decode_utf8 &&
 		   *me->U.utf_buf) {
 	    HTChunkPuts(string, me->U.utf_buf);
@@ -3693,7 +3729,8 @@ static void SGML_character(HTStream *me, int c_in)
 	     * - Takuya ASADA (as...@three-a.co.jp)
 	     */
 	    me->state = S_esc_dq;
-	    HTChunkPutc(string, c);
+	    if (!UTF8_TTY_ISO2022JP)
+		HTChunkPutc(string, c);
 	} else if (me->T.decode_utf8 &&
 		   *me->U.utf_buf) {
 	    HTChunkPuts(string, me->U.utf_buf);
@@ -3956,8 +3993,11 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_paren;
 	} else {
 	    me->state = S_text;
+	    if (UTF8_TTY_ISO2022JP)
+		goto top1;
 	}
-	PUTC(c);
+	if (!UTF8_TTY_ISO2022JP)
+	    PUTC(c);
 	break;
 
     case S_dollar:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
@@ -3966,7 +4006,8 @@ static void SGML_character(HTStream *me, int c_in)
 	} else if (c == '(') {
 	    me->state = S_dollar_paren;
 	}
-	PUTC(c);
+	if (!UTF8_TTY_ISO2022JP)
+	    PUTC(c);
 	break;
 
     case S_dollar_paren:	/* Expecting 'C' after CJK "ESC$(". */
@@ -3974,8 +4015,13 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_nonascii_text;
 	} else {
 	    me->state = S_text;
+	    if (UTF8_TTY_ISO2022JP) {
+		PUTS("$(");
+		goto top1;
+	    }
 	}
-	PUTC(c);
+	if (!UTF8_TTY_ISO2022JP)
+	    PUTC(c);
 	break;
 
     case S_paren:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
@@ -3983,19 +4029,30 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_text;
 	} else if (c == 'I') {
 	    me->state = S_nonascii_text;
+	    if (UTF8_TTY_ISO2022JP)
+		me->kanji_buf = '\t'; /* flag for single byte katakana */
 	} else {
 	    me->state = S_text;
+	    if (UTF8_TTY_ISO2022JP) {
+		PUTC('(');
+		goto top1;
+	    }
 	}
-	PUTC(c);
+	if (!UTF8_TTY_ISO2022JP)
+	    PUTC(c);
 	break;
 
     case S_nonascii_text:	/* Expecting CJK ESC after non-ASCII text. */
 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1264 */
 	    me->state = S_esc;
-	}
-	PUTC(c);
-	if (c < 32)
+	} else if (c < 32) {
 	    me->state = S_text;
+	}
+	if (UTF8_TTY_ISO2022JP) {
+	    if (TOASCII(c) != '\033')
+		PUTUTF8(clong);
+	} else
+	    PUTC(c);
 	break;
 
     case S_esc_sq:		/* Expecting '$'or '(' following CJK ESC. */
@@ -4005,8 +4062,11 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_paren_sq;
 	} else {
 	    me->state = S_squoted;
+	    if (UTF8_TTY_ISO2022JP)
+		goto top1;
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_dollar_sq:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
@@ -4015,7 +4075,8 @@ static void SGML_character(HTStream *me, int c_in)
 	} else if (c == '(') {
 	    me->state = S_dollar_paren_sq;
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_dollar_paren_sq:	/* Expecting 'C' after CJK "ESC$(". */
@@ -4023,8 +4084,13 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_nonascii_text_sq;
 	} else {
 	    me->state = S_squoted;
+	    if (UTF8_TTY_ISO2022JP) {
+		HTChunkPuts(string, "$(");
+		goto top1;
+	    }
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_paren_sq:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
@@ -4032,17 +4098,28 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_squoted;
 	} else if (c == 'I') {
 	    me->state = S_nonascii_text_sq;
+	    if (UTF8_TTY_ISO2022JP)
+		me->kanji_buf = '\t'; /* flag for single byte katakana */
 	} else {
 	    me->state = S_squoted;
+	    if (UTF8_TTY_ISO2022JP) {
+		HTChunkPutc(string, '(');
+		goto top1;
+	    }
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_nonascii_text_sq:	/* Expecting CJK ESC after non-ASCII text. */
 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1281 */
 	    me->state = S_esc_sq;
 	}
-	HTChunkPutc(string, c);
+	if (UTF8_TTY_ISO2022JP) {
+	    if (TOASCII(c) != '\033')
+		HTChunkPutUtf8Char(string, clong);
+	} else
+	    HTChunkPutc(string, c);
 	break;
 
     case S_esc_dq:		/* Expecting '$'or '(' following CJK ESC. */
@@ -4052,8 +4129,11 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_paren_dq;
 	} else {
 	    me->state = S_dquoted;
+	    if (UTF8_TTY_ISO2022JP)
+		goto top1;
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_dollar_dq:		/* Expecting '@', 'B', 'A' or '(' after CJK "ESC$". */
@@ -4062,7 +4142,8 @@ static void SGML_character(HTStream *me, int c_in)
 	} else if (c == '(') {
 	    me->state = S_dollar_paren_dq;
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_dollar_paren_dq:	/* Expecting 'C' after CJK "ESC$(". */
@@ -4070,8 +4151,13 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_nonascii_text_dq;
 	} else {
 	    me->state = S_dquoted;
+	    if (UTF8_TTY_ISO2022JP) {
+		HTChunkPuts(string, "$(");
+		goto top1;
+	    }
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_paren_dq:		/* Expecting 'B', 'J', 'T' or 'I' after CJK "ESC(". */
@@ -4079,17 +4165,28 @@ static void SGML_character(HTStream *me, int c_in)
 	    me->state = S_dquoted;
 	} else if (c == 'I') {
 	    me->state = S_nonascii_text_dq;
+	    if (UTF8_TTY_ISO2022JP)
+		me->kanji_buf = '\t'; /* flag for single byte katakana */
 	} else {
 	    me->state = S_dquoted;
+	    if (UTF8_TTY_ISO2022JP) {
+		HTChunkPutc(string, '(');
+		goto top1;
+	    }
 	}
-	HTChunkPutc(string, c);
+	if (!UTF8_TTY_ISO2022JP)
+	    HTChunkPutc(string, c);
 	break;
 
     case S_nonascii_text_dq:	/* Expecting CJK ESC after non-ASCII text. */
 	if (TOASCII(c) == '\033') {	/* S/390 -- gil -- 1298 */
 	    me->state = S_esc_dq;
 	}
-	HTChunkPutc(string, c);
+	if (UTF8_TTY_ISO2022JP) {
+	    if (TOASCII(c) != '\033')
+		HTChunkPutUtf8Char(string, clong);
+	} else
+	    HTChunkPutc(string, c);
 	break;
 
     case S_junk_tag:
_______________________________________________
Lynx-dev mailing list
Lynx-dev@nongnu.org
https://lists.nongnu.org/mailman/listinfo/lynx-dev

Reply via email to