Package: lookup Version: 1.08b-10 Severity: wishlist Tags: patch Attached is a patch to add UTF-8 output support to lookup. In order to avoid an iconv dependency, I've instead generated two arrays (JIS X 0208 and JIS X 0212) with a Perl script.
However, input support without iconv looks painful, so I thought I'd submit
this as it is for consideration.
If you want to include the lib/jisucs2tbl.h in the tarball (like commands.h)
then simply generate it once, and move lib/jisucs2tbl.h from the clean target
to the realclean target.
Adding any more output encodings would be difficult, as we just used up all the
input encoding bitfields.
diff -ruN lookup-1.08b.orig/cmds.master lookup-1.08b/cmds.master
--- lookup-1.08b.orig/cmds.master 1996-07-19 01:10:36.000000000 +1000
+++ lookup-1.08b/cmds.master 2005-09-01 18:36:22.000000000 +1000
@@ -322,8 +322,8 @@
CMD_GENERAL|CMD_ENCODING_RELATED
report or set the output encoding-method
-output encoding [euc|sjis|jis|...]
-output> (encoding>)?
(euc|sjis|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\s]+)*)
+output encoding [euc|sjis|utf8|jis|...]
+output> (encoding>)?
(euc|sjis|utf8|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\s]+)*)
cmd_output_encoding(\2, \3, \5, \6)
CMD_GENERAL
diff -ruN lookup-1.08b.orig/commands.c lookup-1.08b/commands.c
--- lookup-1.08b.orig/commands.c 2005-09-01 18:48:12.000000000 +1000
+++ lookup-1.08b/commands.c 2005-09-01 18:32:14.000000000 +1000
@@ -1367,6 +1367,7 @@
default: soft_assert(0); break;
case SJIS_OUTPUT: output("sjis"); break;
case EUC_OUTPUT: output("euc"); break;
+ case UTF8_OUTPUT: output("utf8"); break;
case JIS_OUTPUT:
switch(output_style & _JIS_KANJI_STYLE)
@@ -1424,6 +1425,8 @@
(void)select_output_style(EUC_OUTPUT);
else if (main_style[0]=='s' || main_style[0]=='S')
(void)select_output_style(SJIS_OUTPUT);
+ else if (main_style[0]=='u' || main_style[0]=='U')
+ (void)select_output_style(UTF8_OUTPUT);
else if (main_style[0]=='j' || main_style[0]=='j')
{
if (!jis_year)
diff -ruN lookup-1.08b.orig/commands.h lookup-1.08b/commands.h
--- lookup-1.08b.orig/commands.h 1996-07-21 18:52:23.000000000 +1000
+++ lookup-1.08b/commands.h 2005-09-01 18:36:43.000000000 +1000
@@ -696,9 +696,9 @@
/* generated from "cmds.master" record at line 320*/
{
CMD_GENERAL|CMD_ENCODING_RELATED,
- (S)"output encoding [euc|sjis|jis|...]",
+ (S)"output encoding [euc|sjis|utf8|jis|...]",
(S)"report or set the output encoding-method",
-
(S)"^\\s*output>\\s*(encoding>)?\\s*(euc|sjis|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\\s]+)*)\\s*$",
+
(S)"^\\s*output>\\s*(encoding>)?\\s*(euc|sjis|utf8|jis-?(78|83|90)?(-(ascii|roman))?)?((<(212|no212|hwk|nohwk|foldhwk|disp|nodisp|code|mark)>|[-,\\s]+)*)\\s*$",
_func41_,
},
diff -ruN lookup-1.08b.orig/lib/output.c lookup-1.08b/lib/output.c
--- lookup-1.08b.orig/lib/output.c 2005-09-01 18:48:12.000000000 +1000
+++ lookup-1.08b/lib/output.c 2005-09-01 18:35:27.000000000 +1000
@@ -509,6 +509,110 @@
return retval;
}
+#include "jisucs2tbl.h"
+
+/* Convert one EUC-JP byte per call to UTF-8, appended at *nextout++.
+ * Lead bytes are held in static state; returns columns output (0 while a character is incomplete).
+ */
+static unsigned output_euc_as_utf8(unsigned char c)
+{
+ static unsigned char hi = 0; // pending lead byte, 0 when no character is in progress
+ static unsigned char mid = 0; // pending second byte of a THREE_BYTE_HI sequence
+ u_int16_t ucs2; // code point of the completed character
+ int width; // column count handed back to the caller
+
+ // EUC-JP can be one, two or three bytes
+ // First byte:
+ // 0x8e Halfwidth Kana (JIS X 0201 Kana) - 2 bytes HALF_WIDTH_KATA_HI
+ // Easy! UFF{Second byte - 0x40}
+ // 0x8f JIS X 0212 - 3 bytes THREE_BYTE_HI
+ // > 0xa0 JIS X 0208 - 2 bytes - Mapping table
+ // Otherwise, ASCII (JIS X 0201 Roman)
+
+// fprintf(stderr, "0x%02x (0x%02x) ", c, hi);
+ /* Phase 1: nothing can be completed yet -- emit ASCII directly or stash a lead byte */
+ switch (hi) {
+ case 0:
+ // No character in progress: c starts a new one
+ if (c < 0xa0 && c != HALF_WIDTH_KATA_HI && c !=
THREE_BYTE_HI) {
+ // JIS X 0201 Roman / ASCII: copied through unchanged
+ hi = 0;
+ mid = 0;
+ if (c != 0) // a NUL byte is dropped, not written
+ *nextout++ = c;
+ if ((c == '\n' && flush_on_newline) || nextout
>= bufend)
+ flush_raw_output();
+ // I'm sure there're better definitions of
printable than this...
+ return (c >= ' ') ? 1:0;
+ } else {
+ // Lead byte of a multibyte character: remember it, output nothing yet
+ hi = c;
+ mid = 0;
+ return 0;
+ }
+ break;
+ case THREE_BYTE_HI:
+ // JIS X 0212: second of three bytes, character still incomplete
+ if (mid == 0) {
+ mid = c;
+ return 0;
+ }
+ break;
+ }
+
+ /* Phase 2: c is the final byte; map the character to UCS-2 and reset the state */
+ switch(hi) {
+ case HALF_WIDTH_KATA_HI:
+ // JIS X 0201 Kana: trail byte 0xa1 gives 0xff61 (0xff00 + c - 0x40)
+ ucs2 = 0xff00 + c - 0x40;
+ hi = 0;
+ mid = 0;
+ width = 1;
+ break;
+
+ case THREE_BYTE_HI:
+ // JIS X 0212 -- NOTE(review): mid and c are not range-checked before indexing
+ ucs2 = jis212[mid-0xa0][c-0xa0];
+ hi = 0;
+ mid = 0;
+ width = 2;
+ break;
+
+ default:
+ // JIS X 0208 -- NOTE(review): c is not range-checked; a trail byte < 0xa0 gives a negative index
+ ucs2 = jis208[hi-0xa0][c-0xa0];
+ hi = 0;
+ mid = 0;
+ width = 2;
+ break;
+ }
+
+ if (ucs2 < 0x80) {
+ *nextout++ = ucs2;
+ // Mapped down to ASCII. NOTE(review): a table cell of 0x0000 would emit a NUL here -- confirm
+ if ((c == '\n' && flush_on_newline) || nextout >= bufend)
+ flush_raw_output();
+ return 1;
+ } else if (ucs2 < 0x800) {
+ *nextout++ = (0xc0 | (ucs2 >> 6));
+ *nextout++ = (0x80 | (ucs2 & 0x3f));
+ // Two-byte UTF-8 sequence: treated as a single-width character
+ width = 1;
+ } else /* if (ucs2 < 0x10000)*/ {
+ *nextout++ = (0xe0 | (ucs2 >> 12));
+ *nextout++ = (0x80 | ((ucs2 >> 6) & 0x3f));
+ *nextout++ = (0x80 | (ucs2 & 0x3f));
+ /*
+ } else {
+ // Should never get here, EUC-JP doesn't translate outside the BMP.
+ return 0;
+ */
+ }
+ if (nextout >= bufend) // NOTE(review): tested only after the 2-3 bytes were stored
+ flush_raw_output();
+ return width;
+}
+
/***************************************************************/
unsigned (*_output_char_function)(unsigned char) = output_euc_as_jis;
@@ -986,6 +1090,9 @@
break;
}
break;
+ case UTF8_OUTPUT:
+ *function_pointer = output_euc_as_utf8;
+ break;
}
if ((output_style & _KATAKANA) != PASS_HW_KATANANA)
@@ -1019,6 +1126,7 @@
case JIS_ASCII: output(", ASCII)"); break;
}
break;
+ case UTF8_OUTPUT: output("UTF-8"); break;
}
switch (output_style & _0212_1990)
diff -ruN lookup-1.08b.orig/lib/output.h lookup-1.08b/lib/output.h
--- lookup-1.08b.orig/lib/output.h 1996-01-16 04:50:39.000000000 +1100
+++ lookup-1.08b/lib/output.h 2005-09-01 13:48:34.000000000 +1000
@@ -61,7 +61,8 @@
#define EUC_OUTPUT 0x00000001
#define SJIS_OUTPUT 0x00000002
#define JIS_OUTPUT 0x00000004
-#define _BASIC_OUTPUT_TYPE (JIS_OUTPUT|SJIS_OUTPUT|EUC_OUTPUT)
+#define UTF8_OUTPUT 0x00000008
+#define _BASIC_OUTPUT_TYPE (JIS_OUTPUT|SJIS_OUTPUT|EUC_OUTPUT|UTF8_OUTPUT)
#define JIS_1978_OUTPUT JIS_OUTPUT
#define JIS_1983_OUTPUT JIS_OUTPUT|0x00000010
diff -ruN lookup-1.08b.orig/Makefile lookup-1.08b/Makefile
--- lookup-1.08b.orig/Makefile 2005-09-01 18:48:12.000000000 +1000
+++ lookup-1.08b/Makefile 2005-09-01 18:56:15.000000000 +1000
@@ -175,6 +175,9 @@
-echo '#endif /* file wrapper */' >> tmp;
mv tmp lib/system.h
+lib/jisucs2tbl.h: /usr/share/i18n/charmaps/EUC-JP.gz
+ zcat /usr/share/i18n/charmaps/EUC-JP.gz | ./mkeucucs2tbl.pl >
lib/jisucs2tbl.h
+
make.sh: realclean
@echo ':# script to make lookup' > tmp
@echo '## Can set CC= and CFLAGS= on the command line, just as with
make' >> tmp
@@ -219,7 +222,7 @@
clean: tidy
[EMAIL PROTECTED] dummy > dummy.o
- /bin/rm -f \#* *.o *.d doc/#* $(LOCAL_LIB) lib/system.h
+ /bin/rm -f \#* *.o *.d doc/#* $(LOCAL_LIB) lib/system.h lib/jisucs2tbl.h
realclean: clean
[EMAIL PROTECTED] dummy > lookup.man.xxx
@@ -261,7 +264,7 @@
lib/virtfile.h lib/romaji2kana.h lib/jregex.h lib/strsave.h \
lib/replace.h lib/input.h lookup.h lib/jreadline.h
output.o: lib/output.c lib/config.h lib/assert.h lib/input.h \
- lib/output.h
+ lib/output.h lib/jisucs2tbl.h
replace.o: lib/replace.c lib/config.h lib/assert.h lib/jregex.h \
lib/xmalloc.h lib/replace.h
romaji2kana.o: lib/romaji2kana.c lib/config.h lib/assert.h \
diff -ruN lookup-1.08b.orig/mkeucucs2tbl.pl lookup-1.08b/mkeucucs2tbl.pl
--- lookup-1.08b.orig/mkeucucs2tbl.pl 1970-01-01 10:00:00.000000000 +1000
+++ lookup-1.08b/mkeucucs2tbl.pl 2005-09-01 18:25:32.000000000 +1000
@@ -0,0 +1,63 @@
+#! /usr/bin/perl
+
+# Handy reference: http://czyborra.com/utf/
+# Reads the uncompressed charmap (zcat /usr/share/i18n/charmaps/EUC-JP.gz) and prints the C tables jis208/jis212
+
+while ( <> ) { # skip everything before the JIS X 0208 section
+ last if /JIS X 0208/;
+}
+
+my @jis208; # [row][col] => code point, row/col = EUC byte - 0xa0
+while ( <> ) {
+ last if /JIS X 0212/; # the next section starts here
+ next unless m#^<U(....)>\s+/x(..)/x(..)#;
+ my ($ucs, $row, $col) = (hex($1), hex($2)-0xa0, hex($3)-0xa0);
+ $jis208[$row][$col] = $ucs;
+# print "Character 0x$ucs is at JIS X 0208 row $row, col $col\n";
+}
+
+my @jis212; # same layout, filled from the three-byte /x8f/... entries
+while ( <> ) {
+ next unless m#^<U(....)>\s+/x8f/x(..)/x(..)#;
+ my ($ucs, $row, $col) = (hex($1), hex($2)-0xa0, hex($3)-0xa0);
+ $jis212[$row][$col] = $ucs;
+# print "Character 0x$ucs is at JIS X 0212 row $row, col $col\n";
+}
+
+print "// JIS X 208: ".scalar(@jis208)." rows\n";
+
+print "static const u_int32_t jis208[", 0xfe - 0xa0 + 1 ,"][", 0xfe-0xa0 + 1
,"] = {\n";
+for my $i (0 .. 0xfe-0xa0) { # one initializer row per lead byte 0xa0..0xfe
+ print "\t{";
+ for my $j (0 .. 0xfe-0xa0) {
+ if (defined $jis208[$i][$j]) {
+ printf "0x%04x", $jis208[$i][$j];
+ } else {
+ print "0x0000"; # cell with no charmap entry
+ }
+ print ", " unless $j == 0xfe-0xa0; # no comma after the last column
+ }
+ print "}";
+ print ", " unless $i == 0xfe-0xa0; # no comma after the last row
+ print "\n";
+}
+print "};\n";
+
+print "// JIS X 212: ".scalar(@jis212)." rows\n";
+
+print "static const u_int32_t jis212[", 0xfe - 0xa0 + 1 ,"][", 0xfe-0xa0 + 1
,"] = {\n";
+for my $i (0 .. 0xfe-0xa0) { # same emission loop as above, for the jis212 table
+ print "\t{";
+ for my $j (0 .. 0xfe-0xa0) {
+ if (defined $jis212[$i][$j]) {
+ printf "0x%04x", $jis212[$i][$j];
+ } else {
+ print "0x0000"; # cell with no charmap entry
+ }
+ print ", " unless $j == 0xfe-0xa0;
+ }
+ print "}";
+ print ", " unless $i == 0xfe-0xa0;
+ print "\n";
+}
+print "};\n";
-- System Information:
Debian Release: testing/unstable
APT prefers unstable
APT policy: (990, 'unstable'), (950, 'unstable'), (900, 'experimental')
Architecture: i386 (i686)
Shell: /bin/sh linked to /bin/bash
Kernel: Linux 2.6.12
Locale: LANG=en_AU.UTF-8, LC_CTYPE=en_AU.UTF-8 (charmap=UTF-8)
Versions of packages lookup depends on:
ii libc6 2.3.5-6 GNU C Library: Shared libraries an
lookup recommends no packages.
-- no debconf information
--
Paul "TBBle" Hampson, [EMAIL PROTECTED]
8th year CompSci/Asian Studies student, ANU
Shorter .sig for a more eco-friendly paperless office.
pgpgYlpjCZKzX.pgp
Description: PGP signature

