Robert Brady wrote on 2000-07-02 17:41 UTC:
> * charclass-table.h automatically generated at compile time from another
> file, which is a lot smaller. this second file charclass-table.t will
> hopefully be small enough to go in the tarball. if another version
> of the standard comes out, it itself can be regenerated from
> UnicodeData.txt
This is only needed for double-click left-mouse word selection and is
therefore not performance critical at all. I don't think wasting 128 kB
of RAM for a 2^16-entry table can be justified here. I have just written
an alternative implementation, which I attach below. It maintains 100%
backwards compatibility with the existing charclass mechanism and
extends it to UCS-4 without any huge data structures. I'd guess Thomas
might feel much more comfortable with your patch if you used this simple
solution instead of the huge tables.
> * precompose.h still icky.
I think this would best be done via a binary search on a sorted array,
similar to the code in keysym2ucs.c.
Markus
--
Markus G. Kuhn, Computer Laboratory, University of Cambridge, UK
Email: mkuhn at acm.org, WWW: <http://www.cl.cam.ac.uk/~mgk25/>
/*
* Compact and efficient reimplementation of the
* xterm character class mechanism for large character sets
*
* Markus Kuhn -- [EMAIL PROTECTED] -- 2000-07-03
*
* Xterm allows users to select entire words with a double-click on
* the left mouse button. Opinions might differ on what type of
* characters are part of separate words, therefore xterm allows users
* to configure a class code for each 8-bit character. Words are
* maximum length sequences of neighboring characters with identical
* class code. Naively extending this mechanism to Unicode would
* require a class code table at least 2^16 entries (128 kB) long.
* Instead, we transform the character class table into a list
* of intervals that is accessed via a linear search.
* Changes made to the table by the user are appended; when several
* intervals contain a character, the last one in the list wins. A special
* class code -1 (default) marks characters whose class code is their
* own code number. We could alternatively use a sorted table of
* non-overlapping intervals that can be accessed via binary search,
* but merging in new intervals is significantly more hassle and
* not worth the effort here.
*/
#include <stdlib.h>
#include <stdio.h>
/*
 * One interval of the character class table: every character code c with
 * first <= c <= last is assigned the class code `class`.
 *
 * Entry 0 of the table is not an interval but a header:
 *   - class: number of entries allocated for the table
 *   - first: index of the first used interval entry (always 1)
 *   - last:  index of the last used interval entry
 */
struct classentry {
    int class;
    int first;
    int last;
};

/* The interval table itself; element 0 is the header described above. */
struct classentry *classtab;
int SetCharacterClassRange(int low, int high, int value)
{
if (high < low)
return -1; /* nothing to do */
/* make sure we have at least one free entry left at table end */
if (classtab[0].last > classtab[0].class - 2) {
classtab[0].class += 5 + classtab[0].class/4;
classtab = realloc(classtab,
classtab[0].class * sizeof(struct classentry));
if (!classtab)
abort();
}
/* simply append new interval to end of interval array */
classtab[0].last++;
classtab[classtab[0].last].first = low;
classtab[classtab[0].last].last = high;
classtab[classtab[0].last].class = value;
return 0;
}
/*
 * Allocate classtab with room for 50 entries, set up the header in
 * entry 0, and append the built-in default intervals in the order listed
 * below. Aborts the process if the allocation fails.
 *
 * Several defaults deliberately overlap an earlier, wider interval
 * (e.g. 215 inside 192..255); the order of the list must be kept.
 */
void init_classtab(void)
{
    enum { INITIAL_ENTRIES = 50 };

    static const struct {
        int low;
        int high;
        int value;
    } defaults[] = {
        /* old xterm default classes */
        { 0,      0,      32 },
        { 1,      31,     1 },
        { '\t',   '\t',   32 },
        { '0',    '9',    48 },
        { 'A',    'Z',    48 },
        { '_',    '_',    48 },
        { 'a',    'z',    48 },
        { 127,    159,    1 },
        { 160,    191,    -1 },
        { 192,    255,    48 },
        { 215,    215,    216 },
        { 247,    247,    248 },
        /* added Unicode classes */
        { 0x0100, 0xffdf, 48 },     /* mostly characters */
        { 0x037e, 0x037e, -1 },     /* Greek question mark */
        { 0x0387, 0x0387, -1 },     /* Greek ano teleia */
        { 0x055a, 0x055f, -1 },     /* Armenian punctuation */
        { 0x0589, 0x0589, -1 },     /* Armenian full stop */
        { 0x0700, 0x070d, -1 },     /* Syriac punctuation */
        { 0x104a, 0x104f, -1 },     /* Myanmar punctuation */
        { 0x10fb, 0x10fb, -1 },     /* Georgian punctuation */
        { 0x1361, 0x1368, -1 },     /* Ethiopic punctuation */
        { 0x166d, 0x166e, -1 },     /* Canadian Syl. punctuation */
        { 0x17d4, 0x17dc, -1 },     /* Khmer punctuation */
        { 0x1800, 0x180a, -1 },     /* Mongolian punctuation */
        { 0x2000, 0x200a, 32 },     /* spaces */
        { 0x200b, 0x27ff, -1 },     /* punctuation and symbols */
        { 0x2070, 0x207f, 0x2070 }, /* superscript */
        { 0x2080, 0x208f, 0x2080 }, /* subscript */
        { 0x3000, 0x3000, 32 },     /* ideographic space */
        { 0x3001, 0x3020, -1 },     /* ideographic punctuation */
        { 0xfe30, 0xfe6b, -1 },     /* punctuation forms */
        { 0xff00, 0xff0f, -1 },     /* half/fullwidth ASCII */
        { 0xff1a, 0xff20, -1 },     /* half/fullwidth ASCII */
        { 0xff3b, 0xff40, -1 },     /* half/fullwidth ASCII */
        { 0xff5b, 0xff64, -1 }      /* half/fullwidth ASCII */
    };
    size_t k;

    classtab = malloc(INITIAL_ENTRIES * sizeof(struct classentry));
    if (classtab == NULL)
        abort();

    /* Header entry: capacity, first used index, last used index (none yet). */
    classtab[0].class = INITIAL_ENTRIES;
    classtab[0].first = 1;
    classtab[0].last = 0;

    for (k = 0; k < sizeof defaults / sizeof defaults[0]; k++)
        SetCharacterClassRange(defaults[k].low, defaults[k].high,
                               defaults[k].value);
}
int CharacterClass(int c)
{
int i, class = -1;
for (i = classtab[0].first; i <= classtab[0].last; i++)
if (classtab[i].first <= c && classtab[i].last >= c)
class = classtab[i].class;
if (class < 0)
class = c;
return class;
}
/*
 * Quick test: build the default table, then print the class codes of
 * the 256 Latin-1 characters, eight per line, for comparison with the
 * man page.
 */
int main()
{
    int row, col;

    init_classtab();

    for (row = 0; row < 256; row += 8) {
        for (col = 0; col < 8; col++)
            printf("%4d,", CharacterClass(row + col));
        puts("");
    }

    return 0;
}