> BTW, attached is the file gpreconv [...]

Since it is possible that some mailers don't accept attachments, I resend it here, directly embedded in the email.
Werner ====================================================================== #define I18N #include <stdio.h> #include <errno.h> #include <unistd.h> #include <string.h> #ifdef I18N #include <locale.h> #include <langinfo.h> #include <iconv.h> #endif /* I18N */ #define MAX_VAR_LEN 100 char *check_encoding_tag(char *); char *check_encoding_tag_parse_tag(char *, char **, char **); char *read_file (FILE *); char *emacs2mime(char *); char *mime2locale(char *); void conversion_latin1 (char *, char *); void conversion_utf8 (char *, char *); void conversion_cp1047 (char *, char *); void conversion_iconv (char *, char *); void help (char *); typedef struct { char * from; char * to; } encname_t; encname_t emacs_to_mime[] = { {"alternativnyj", ""}, {"big5", "Big5"}, {"chinese-big5", "Big5"}, {"chinese-euc", "EUC-CN"}, {"chinese-hz", "HZ-GB-2312"}, {"chinese-iso-7bit", ""}, {"chinese-iso-8bit", ""}, {"chinese-iso-8bit-with-esc", ""}, {"cn-big5", "Big5"}, {"cn-gb-2312", "GB2312"}, {"compound-text", ""}, {"ctext", ""}, {"cyrillic-alternativnyj", ""}, {"cyrillic-iso-8bit", "ISO-8859-5"}, {"cyrillic-iso-8bit-with-esc", ""}, {"cyrillic-koi8", "KOI8-R"}, {"dos", ""}, {"emacs-mule", ""}, {"euc-china", "EUC-CN"}, {"euc-cn", "EUC-CN"}, {"euc-japan", "EUC-JP"}, {"euc-jisx0213", "EUC-JP"}, {"euc-jisx0213-with-esc", "EUC-JP"}, {"euc-jp", "EUC-JP"}, {"euc-korea", "EUC-KR"}, {"euc-kr", "EUC-KR"}, {"gb2312", "GB2312"}, {"greek-iso-8bit", "ISO-8859-7"}, {"greek-iso-8bit-with-esc", "ISO-8859-7"}, {"hebrew-iso-8bit", "ISO-8859-8"}, {"hebrew-iso-8bit-with-esc", "ISO-8859-8"}, {"hz", "HZ-GB-2312"}, {"hz-gb-2312", "HZ-GB-2312"}, {"in-is13194-devanagari", ""}, {"in-is13194-devanagari-with-esc", ""}, {"iso-2022-7", ""}, {"iso-2022-7bit", ""}, {"iso-2022-7bit-lock", ""}, {"iso-2022-7bit-lock-ss2", ""}, {"iso-2022-7bit-ss2", ""}, {"iso-2022-8", ""}, {"iso-2022-8bit", ""}, {"iso-2022-8bit-lock" , ""}, {"iso-2022-8bit-lock-ss2", ""}, {"iso-2022-8bit-ss2", ""}, {"iso-2022-cjk", ""}, {"iso-2022-cn", 
"ISO-2022-CN"}, {"iso-2022-cn-ext", "ISO-2022-CN-EXT"}, {"iso-2022-int-1", ""}, {"iso-2022-jp", "ISO-2022-JP"}, {"iso-2022-jp-1978-irv", "ISO-2022-JP"}, {"iso-2022-jp-2", "ISO-2022-JP-2"}, {"iso-2022-jp-3", ""}, {"iso-2022-jp-3-compatible", ""}, {"iso-2022-jp-3-strict", ""}, {"iso-2022-kr", "ISO-2022-KR"}, {"iso-2022-lock", ""}, {"iso-8859-1", "ISO-8859-1"}, {"iso-8859-2", "ISO-8859-2"}, {"iso-8859-3", "ISO-8859-3"}, {"iso-8859-4", "ISO-8859-4"}, {"iso-8859-5", "ISO-8859-5"}, {"iso-8859-6", "ISO-8859-6"}, {"iso-8859-7", "ISO-8859-7"}, {"iso-8859-8", "ISO-8859-8"}, {"iso-8859-9", "ISO-8859-9"}, {"iso-latin-1", "ISO-8859-1"}, {"iso-latin-2", "ISO-8859-2"}, {"iso-latin-3", "ISO-8859-3"}, {"iso-latin-4", "ISO-8859-4"}, {"iso-latin-5", "ISO-8859-9"}, {"iso-safe", ""}, {"japanese-iso-7bit-1978-irv", "ISO-2022-JP"}, {"japanese-iso-8bit", ""}, {"japanese-iso-8bit-with-esc", ""}, {"japanese-euc", "EUC-JP"}, {"japanese-shift-jis", "Shift_JIS"}, {"japanese-shift-jisx0213", ""}, {"junet", "ISO-2022-JP"}, {"koi8", "KOI8-R"}, {"koi8-r", "KOI8-R"}, {"korean-euc", "EUC-KR"}, {"korean-iso-7bit-lock", "ISO-2022-KR"}, {"korean-iso-8bit", ""}, {"korean-iso-8bit-with-esc", ""}, {"lao", ""}, {"lao-with-esc", ""}, {"latin-1", "ISO-8859-1"}, {"latin-2", "ISO-8859-2"}, {"latin-3", "ISO-8859-3"}, {"latin-4", "ISO-8859-4"}, {"latin-5", "ISO-8859-9"}, {"mac", ""}, {"old-jis", "ISO-2022-JP"}, {"raw-text", ""}, {"shift_jis", "Shift_JIS"}, {"shift_jisx0213", "Shift_JIS"}, {"sjis", "Shift_JIS"}, {"th-tis620", "TIS-620"}, {"thai-tis620", "TIS-620"}, {"tibetan", ""}, {"tis-620", "TIS-620"}, {"tis620", "TIS-620"}, {"us-ascii", "US-ASCII"}, {"utf-16-be", "UTF-16BE"}, {"utf-16-be-no-signature", "UTF-16BE"}, {"utf-16-le", "UTF-16LE"}, {"utf-16-le-no-signature", "UTF-16LE"}, {"utf-7", "UTF-7"}, {"utf-7-safe", "UTF-7"}, {"utf-8", "UTF-8"}, {"utf-8-ws", "UTF-8"}, {"vietnamese-viqr", "VIQR"}, {"vietnamese-viscii", "VISCII"}, {"vietnamese-vscii", "VISCII"}, {"viqr", "VIQR"}, {"viscii", "VISCII"}, {"vscii", 
"VSCII"}, {"x-ctext", ""}, {NULL, NULL} }; encname_t mime_to_locale[] = { {NULL, NULL} }; main(int argc, char **argv) { char *encoding=NULL, *default_encoding, *inbuf, *locale; FILE *fp; /* determine the default encoding. This part has to be located * before getopt() since the help message shows the default * encoding. */ #ifdef I18N setlocale(LC_ALL, ""); locale = setlocale(LC_CTYPE, NULL); if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX")) { default_encoding = "latin1"; } else { default_encoding = nl_langinfo(CODESET); if (!default_encoding) default_encoding = "latin1"; } #else default_encoding = "latin1"; #endif /* parse the command option */ while(1){ int opt; opt = getopt(argc, argv, "e:h"); if (opt == -1) break; switch(opt){ case 'e': encoding = (char *)strdup(optarg); break; case 'h': help(default_encoding); exit(0); default: exit(1); } } /* read a source */ if (optind < argc) { fp = fopen(argv[optind], "r"); if (!fp) { printf("Cannot open %s\n", argv[optind]); exit(1); } inbuf = read_file(fp); fclose(fp); } else { inbuf = read_file(stdin); } /* finally determine the encoding */ if (encoding == NULL) { encoding = check_encoding_tag(inbuf); if (encoding == NULL) { encoding = default_encoding; } } /* translate from MIME & Emacs encoding names to locale encoding names */ encoding = emacs2mime(encoding); encoding = mime2locale(encoding); /* call converter (converters write to stdout) */ if (!strcasecmp(encoding, "latin1")) { conversion_latin1(inbuf, encoding); } else if (!strcasecmp(encoding, "utf8")) { conversion_utf8(inbuf, encoding); } else if (!strcasecmp(encoding, "cp1047")) { conversion_cp1047(inbuf, encoding); } else { #ifdef I18N conversion_iconv(inbuf, encoding); #else printf("Conversion from %s to UTF-8 is not supported.\n", encoding); exit(1); #endif } } /* --------------------------------------------------------- * print help message * --------------------------------------------------------- */ void help(char *default_encoding) { 
printf( "Preprocessor for Groff system (%s)\n" "Usage: gpreconv [option] [input file]\n" " -e encoding specify encoding\n" " -h this message\n" "The default encoding is \"%s\".\n", #ifdef I18N "internationalized version", #else "non-internationalized version", #endif default_encoding); } /* --------------------------------------------------------- * read input file * The file has to be inputed as a whole before conversion * since the encoding may be stateful like ISO-2022 series. * --------------------------------------------------------- */ char *read_file (FILE *fp) { #define READBUF_SIZE 32768 char *buf = NULL; size_t bufsize = 0; size_t readsize = 0; size_t n; while(1) { if (readsize == bufsize) { bufsize += READBUF_SIZE; buf = (char *)realloc(buf, bufsize + 1); if (!buf) { printf("Unable to allocate memory.\n"); exit(1); } } n = fread (buf+readsize, 1, bufsize-readsize, fp); readsize += n; if (feof(fp)) break; if (ferror(fp)) { printf("Read error.\n"); exit(1); } } buf[readsize] = 0; return buf; } /* --------------------------------------------------------- * check encoding tag in the read buffer (not implemented yet) * --------------------------------------------------------- */ char *check_encoding_tag(char *inbuf) { char *p, *lineend, *d1, *d2, *variable, *value; for (p=inbuf ; !strncmp(p, ".\\\"", 3) ; p = lineend + 1) { if ((lineend = strchr(p, '\n')) == NULL) break; *lineend = 0; d1 = strstr(p, "-*-"); if (d1) d2 = strstr(d1+3, "-*-"); *lineend = '\n'; if (!d1 || !d2) continue; *d2 = 0; d1+=3; while(*d1) { d1 = check_encoding_tag_parse_tag(d1, &variable, &value); if (!strcasecmp(variable, "coding")) { *d2 = '-'; return value; } } *d2 = '-'; } return NULL; } char *check_encoding_tag_parse_tag(char *d1, char **variable, char **value) { static char var[MAX_VAR_LEN], val[MAX_VAR_LEN]; int l; *variable = var; *value = val; while (*d1 == ' ' || *d1 == '\t') d1++; l = 0; while (l<MAX_VAR_LEN-1 && *d1 && !strchr(";: \t", *d1)) { var[l++] = *(d1++); } var[l] = 0; 
while (*d1 && *d1!=':' && *d1!=';') d1++; val[0] = 0; if (!*d1) return d1; if (*d1 == ';') return d1+1; d1++; while (*d1 == ' ' || *d1 == '\t') d1++; l = 0; while (l<MAX_VAR_LEN-1 && *d1 && !strchr("; \t", *d1)) { val[l++] = *(d1++); } val[l] = 0; while (*d1 && *d1!=';') d1++; if (!*d1) return d1; if (*d1 == ';') return d1+1; } /* --------------------------------------------------------- * convert encoding name from emacs to mime * --------------------------------------------------------- */ char *emacs2mime(char *emacs_encoding) { static char emacs_enc[MAX_VAR_LEN]; int emacs_enc_len; encname_t *table; strncpy(emacs_enc, emacs_encoding, MAX_VAR_LEN-1); emacs_enc[MAX_VAR_LEN-1] = 0; emacs_enc_len = strlen(emacs_enc); if (!strcasecmp(emacs_enc + emacs_enc_len - 4, "-dos")) emacs_enc[emacs_enc_len - 4] = 0; if (!strcasecmp(emacs_enc + emacs_enc_len - 4, "-mac")) emacs_enc[emacs_enc_len - 4] = 0; if (!strcasecmp(emacs_enc + emacs_enc_len - 5, "-unix")) emacs_enc[emacs_enc_len - 5] = 0; for (table = emacs_to_mime; table->from; table++) { if (!strcasecmp(emacs_enc, table->from)) return table->to; } return emacs_enc; } /* --------------------------------------------------------- * convert encoding name from mime to locale * --------------------------------------------------------- */ char *mime2locale(char *mime_encoding) { encname_t *table; for (table = mime_to_locale; table->from; table++) { if (!strcasecmp(mime_encoding, table->from)) return table->to; } return mime_encoding; } /* --------------------------------------------------------- * conversion functions * --------------------------------------------------------- */ /* conversion from ISO-8859-1 (aka Latin-1) to UTF-8 */ void conversion_latin1 (char *inbuf, char *encoding) { unsigned char *p; for(p=inbuf; *p; p++) { if (*p < 0x80) putchar(*p); else {putchar(0xc0 + (*p >> 6)); putchar(0x80 + (*p & 0x3f));} } return; } /* conversion from UTF-8 to UTF-8, i.e., do nothing */ void conversion_utf8 (char *inbuf, char 
*encoding) { fwrite(inbuf, 1, strlen(inbuf), stdout); return; } /* conversion from CP1047 (EBCDIC) to UTF-8 */ /* the table is made from /font/devcp1047/R.proto in groff 1.16 */ void conversion_cp1047 (char *inbuf, char *encoding) { static unsigned char cp1047[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, /* 0x40-0x47 */ 0xe7, 0xf1, 0xa2, '.' , '<' , '(' , '+' , '|' , /* 0x48-0x4f */ '&' , 0xe8, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, /* 0x50-0x57 */ 0xec, 0xdf, '!' , '$' , '*' , ')' , ';' , '^' , /* 0x58-0x5f */ '-' , '/' , 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, /* 0x60-0x67 */ 0xc7, 0xd1, 0xa6, ',' , '%' , '_' , '>' , '?' 
, /* 0x68-0x6f */ 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, /* 0x70-0x77 */ 0xcc, '`' , ':' , '#' , '@' , '\'', '=' , '\"', /* 0x78-0x7f */ 0xd8, 'a' , 'b' , 'c' , 'd' , 'e' , 'f' , 'g' , /* 0x80-0x87 */ 'h' , 'i' , 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1, /* 0x88-0x8f */ 0xb0, 'j' , 'k' , 'l' , 'm' , 'n' , 'o' , 'p' , /* 0x90-0x97 */ 'q' , 'r' , 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4, /* 0x98-0x9f */ 0xb5, '~' , 's' , 't' , 'u' , 'v' , 'w' , 'x' , /* 0xa0-0xa7 */ 'y' , 'z' , 0xa1, 0xbf, 0xd0, '[' , 0xde, 0xae, /* 0xa8-0xaf */ 0xac, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, /* 0xb0-0xb7 */ 0xbd, 0xbe, 0xdd, 0xa8, 0xaf, ']' , 0xb4, 0xd7, /* 0xb8-0xbf */ '{' , 'A' , 'B' , 'C' , 'D' , 'E' , 'F' , 'G' , /* 0xc0-0xc7 */ 'H' , 'I' , 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5, /* 0xc8-0xcf */ '}' , 'J' , 'K' , 'L' , 'M' , 'N' , 'O' , 'P' , /* 0xd0-0xd7 */ 'Q' , 'R' , 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff, /* 0xd8-0xdf */ '\\', 0xf7, 'S' , 'T' , 'U' , 'V' , 'W' , 'X' , /* 0xe0-0xe7 */ 'Y' , 'Z' , 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5, /* 0xe8-0xef */ '0' , '1' , '2' , '3' , '4' , '5' , '6' , '7' , /* 0xf0-0xf7 */ '8' , '9' , 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x00 /* 0xf8-0xff */ }; unsigned char *p, c; for(p=inbuf; *p; p++) { c = cp1047[*p]; if (c == 0) c = *p; /* fail safe */ if (c < 0x80) putchar(c); else { putchar(0xc0 + (c >> 6)); putchar(0x80 + (c & 0x3f)); } } return; } /* locale-sensible conversion */ #ifdef I18N void conversion_iconv (char *inbuf, char *encoding) { #define OUTBUF_SIZE 32768 #define OUTBUF_LIMIT 10 char *outbuf = NULL, *outbuf_top = NULL; size_t bufsize = 0; size_t inbytesleft, outbytesleft, status; iconv_t handle; handle = iconv_open("UTF-8", encoding); if (handle == (iconv_t)-1) { if (errno == EINVAL) { printf("Conversion from %s to UTF-8 is not supported.\n", encoding); exit(1); } printf("iconv_open failed!\n"); exit(1); } inbytesleft = strlen(inbuf) + 1; outbytesleft = 0; while (inbytesleft > 0) { if (outbytesleft < OUTBUF_LIMIT) { size_t outsize = outbuf - 
outbuf_top; outbuf_top = (char *)realloc(outbuf_top, bufsize+=OUTBUF_SIZE); if (!outbuf_top) { printf("Unable to allocate memory.\n"); exit(1); } outbuf = outbuf_top + outsize; outbytesleft += OUTBUF_SIZE; } status = iconv(handle, &inbuf, &inbytesleft, &outbuf, &outbytesleft); if (status == -1) { if (errno == EINVAL || errno == EILSEQ) { printf("Invalid character.\n"); exit(1); } } } fwrite(outbuf_top, 1, strlen(outbuf_top), stdout); return; } #endif _______________________________________________ Groff mailing list Groff@gnu.org http://lists.gnu.org/mailman/listinfo/groff