Below is a first-aid bug fix.
We define a read function for UTF-8:
/*
 * Length in bytes of the UTF-8 sequence introduced by lead byte c.
 *	0bbbbbbb                              -> 1
 *	110bbbbb 10bbbbbb                     -> 2
 *	1110bbbb 10bbbbbb 10bbbbbb            -> 3
 *	11110bbb 10bbbbbb 10bbbbbb 10bbbbbb   -> 4
 * Anything else (a stray continuation byte or an invalid lead) counts
 * as 1 so that it is passed through unchanged for the caller to reject.
 */
static int
readu_seqlen(unsigned char c)
{
	if((c & 0x80) == 0)
		return 1;
	if((c & 0xe0) == 0xc0)
		return 2;
	if((c & 0xf0) == 0xe0)
		return 3;
	if((c & 0xf8) == 0xf0)
		return 4;
	return 1;
}

/*
 * read until utf boundary
 *
 * Like read(fd, buf, n), but the data returned never ends in the
 * middle of a UTF-8 sequence.  Trailing bytes of a sequence that is
 * not yet complete are held in a static buffer and placed at the
 * front of buf on the next call.
 *
 * Returns the number of bytes stored in buf, 0 at end of file, or a
 * negative value on error.  An error is also reported when buf has no
 * room beyond the bytes already held, so n should be at least 4.
 * At end of file a held, truncated sequence is returned as is rather
 * than dropped.
 *
 * The held bytes are static and not associated with fd, so this is
 * only usable for one stream at a time and is not reentrant.
 */
int
readu(int fd, char *buf, int n)
{
	static char b[3];	/* tail of an incomplete sequence (at most 3 bytes) */
	static int nb;		/* number of valid bytes in b */
	int m, len, k;
	char *s, *e;

	for(;;){
		if(n < 0 || (nb > 0 && n <= nb))
			return -1;
		if(nb > 0)
			memcpy(buf, b, nb);
		m = read(fd, buf + nb, n - nb);
		if(m < 0)
			return m;	/* held bytes stay in b for a later retry */
		if(m == 0){
			/* end of file: hand back whatever is held, then report 0 */
			m = nb;
			nb = 0;
			return m;
		}
		e = buf + nb + m;

		/* advance s over every sequence that is entirely inside [buf, e) */
		for(s = buf; s < e; s += k){
			len = readu_seqlen((unsigned char)*s);
			/* k counts the lead plus the continuation bytes that follow it */
			for(k = 1; k < len && s + k < e && (s[k] & 0xc0) == 0x80; k++)
				;
			/*
			 * Ran out of data before the sequence was complete: hold it back.
			 * If instead a non-continuation byte cut it short, the sequence is
			 * malformed and is passed through rather than waited for.
			 */
			if(k < len && s + k == e)
				break;
		}

		/* we have e - s bytes in s */
		nb = e - s;
		memcpy(b, s, nb);
		if(s > buf)
			return s - buf;
		/* nothing complete yet; returning 0 would look like end of file */
	}
}
Then replace 'read' with 'readu' in utf.c:
utf_in(int fd, long *notused, struct convert *out)
{
...
while((n = readu(fd, buf+tot, N-tot)) >= 0){
...
}
Kenji Arisawa