On Tue, 26 Jun 2007 at 12:35, Artur Litwinowicz wrote:
> O I think it is not good idea to change the code of OpenBSD by me. 
> Years ago I was coding in C++ (at the University but with best mark ;). Now
> I am working for Oracle Corp. (PL/SQL and etc.) and I am a little out of
> practice ;) with C/C++.
> 
> Maybe someone core OpenBSD Developer will agree with me, that the extended
> COLLATION in OpenBSD will be the strong point in the system functionality ?
> 

I had the same problem with PostgreSQL on OpenBSD a few years ago and
I've written some C-language extensions (database functions).  Be
advised that this is a totally non-standard solution.  Here you go:

The idea was to make simple functions for sorting and upper/lower
conversion, that would be faster than typical unicode table lookups.
Since I only ever needed a single language at a time, I didn't need
all the fancy unicode stuff.

In "lang.h", there are lower and upper alphabet strings for each
language (currently english and slovenian).  You can add your own, of
course.

If you run "make" (you may need to edit the Makefile first), you'll
get the "hash_en.so", "hash_sl.so", "upper_en.so", "upper_sl.so",
"lower_en.so" and "lower_sl.so" shared libs containing postgres
functions with the same names.

To load them into database, use
    CREATE FUNCTION func_name(TEXT) RETURNS TEXT AS \
     'path/to/func_file.so', 'func_name' LANGUAGE C IMMUTABLE STRICT
for each of them.

The upper_XX and lower_XX functions return the upper/lowercase version
of the input string.  The hash_XX function replaces the input string
with new string where each letter is replaced with its position in the
alphabet.

For example, instead of
    SELECT ... ORDER BY my_column
you can use
    SELECT ... ORDER BY hash_sl(my_column)
and you've got slovenian sort order.

For performance, create an index on hash_sl(my_column), not my_column.

This will only work on "unicode" databases.

Of course, there may be bugs.  They are quite likely, actually.  I
remember I wrote all this in a hurry.  But it has worked OK for at
least three projects now.

All the files except for "lang.h" follow below.  For "lang.h", go to
"http://www.komna.com/tin/lang.h"; (it's UTF-8 encoded, so I can't put
it here inline).



# Makefile
#
# Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#

INCLUDE_DIR=/usr/local/include/postgresql/server

build: hash_en.so hash_sl.so lower_en.so lower_sl.so upper_en.so upper_sl.so

hash_en.so: hash.c
        cc -Wall -Werror -fpic -c -o hash_en.o hash.c -I$(INCLUDE_DIR) -DEN
        ld -Bshareable -o hash_en.so hash_en.o

hash_sl.so: hash.c
        cc -Wall -Werror -fpic -c -o hash_sl.o hash.c -I$(INCLUDE_DIR) -DSL
        ld -Bshareable -o hash_sl.so hash_sl.o

lower_en.so: lower.c
        cc -Wall -Werror -fpic -c -o lower_en.o lower.c -I$(INCLUDE_DIR) -DEN
        ld -Bshareable -o lower_en.so lower_en.o

lower_sl.so: lower.c
        cc -Wall -Werror -fpic -c -o lower_sl.o lower.c -I$(INCLUDE_DIR) -DSL
        ld -Bshareable -o lower_sl.so lower_sl.o

upper_en.so: upper.c
        cc -Wall -Werror -fpic -c -o upper_en.o upper.c -I$(INCLUDE_DIR) -DEN
        ld -Bshareable -o upper_en.so upper_en.o

upper_sl.so: upper.c
        cc -Wall -Werror -fpic -c -o upper_sl.o upper.c -I$(INCLUDE_DIR) -DSL
        ld -Bshareable -o upper_sl.so upper_sl.o

clean:
        -rm *.o *.so



/*
 * $Id: utf8.h,v 1.1.1.1 2004/12/14 14:53:28 tin Exp $
 *
 * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>

typedef u_int32_t widechar_t;

#define STR_WCHAR_UNKNOWN ((widechar_t)'?')


/* read a wide char from UTF-8 string *ptr and point *ptr to the next one */
widechar_t
utf8get(char **ptr)
{
        unsigned char uch;
        widechar_t res;

        uch = (unsigned char)(**ptr);
        (*ptr)++;

        if (!(uch & 0x80))
                return((widechar_t)uch);

        if ((uch & 0xFE) == 0xC0)
                return(STR_WCHAR_UNKNOWN);

        if (((uch & 0xFF) == 0xE0) && ((**ptr & 0xE0) == 0x80))
                return(STR_WCHAR_UNKNOWN);

        if (((uch & 0xFF) == 0xF0) && ((**ptr & 0xF0) == 0x80))
                return(STR_WCHAR_UNKNOWN);

        if (((uch & 0xFF) == 0xF8) && ((**ptr & 0xF8) == 0x80))
                return(STR_WCHAR_UNKNOWN);

        if (((uch & 0xFF) == 0xFC) && ((**ptr & 0xFC) == 0x80))
                return(STR_WCHAR_UNKNOWN);
        
        if ((**ptr & 0xC0) != 0x80)
                return(STR_WCHAR_UNKNOWN);

        res = (widechar_t)((unsigned char)*((*ptr)++)) & 0x0000003F;
        if(uch & 0x20) {
                if((**ptr & 0xC0) != 0x80) {
                        (*ptr)--;
                        return(STR_WCHAR_UNKNOWN);
                }
                res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) &
                    0x0000003F);
        } else if((uch & 0xE0) != 0xC0) {
                (*ptr)-=2;
                return(STR_WCHAR_UNKNOWN);
        } else
                return(res | (((widechar_t)uch & 0x0000001F) << 6));

        if(uch & 0x10) {
                res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) &
                    0x0000003F);
        } else if((uch & 0xF0) != 0xE0) {
                (*ptr)-=2;
                return(STR_WCHAR_UNKNOWN);
        } else
                return(res | (((widechar_t)uch & 0x0000000F) << 12));

        if(uch & 0x08) {
                if((**ptr & 0xC0) != 0x80) {
                        (*ptr)-=3;
                        return(STR_WCHAR_UNKNOWN);
                }
                res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) &
                    0x0000003F);
        } else if((uch & 0xF8) != 0xF0) {
                (*ptr)-=3;
                return(STR_WCHAR_UNKNOWN);
        } else
                return(res | (((widechar_t)uch & 0x00000007) << 18));

        if(uch & 0x04)
        {
                if((**ptr & 0xC0) != 0x80) {
                        (*ptr)-=4;
                        return(STR_WCHAR_UNKNOWN);
                }
                res = (res << 6) | ((widechar_t)((unsigned char)*((*ptr)++)) &
                    0x0000003F);
        } else if((uch & 0xFC) != 0xF8) {
                (*ptr)-=4;
                return(STR_WCHAR_UNKNOWN);
        } else
                return(res | (((widechar_t)uch & 0x00000003) << 24));

        if ((uch & 0xFE) != 0xFC) {
                (*ptr)-=5;
                return(STR_WCHAR_UNKNOWN);
        }

        return (res | (((widechar_t)uch & 0x00000001) << 30));
}


/* write character ch to *ptr and point *ptr at the end */
void
utf8put(char **ptr, widechar_t ch)
{
        widechar_t bech = ch;

        if (bech < 0x00000080) {
                *((*ptr)++) = (char)bech;
                return;
        }

        if (bech < 0x00000800) {
                *((*ptr)++) =
                    (char)(((bech >> 6) & 0x0000001F) | (0x000000C0));
                *((*ptr)++) =
                    (char)(((bech) & 0x0000003F) | (0x00000080));
                return;
        }

        if (bech < 0x00010000) {
                *((*ptr)++) =
                    (char)(((bech >> 12) & 0x0000000F) | (0x000000E0));
                *((*ptr)++) =
                    (char)(((bech >> 6) & 0x0000003F) | (0x00000080));
                *((*ptr)++) =
                    (char)(((bech) & 0x0000003F) | (0x00000080));
                return;
        }

        if (bech < 0x00200000) {
                *((*ptr)++) =
                    (char)(((bech >> 18) & 0x00000007) | (0x000000F0));
                *((*ptr)++) =
                    (char)(((bech >> 12) & 0x0000003F) | (0x00000080));
                *((*ptr)++) =
                    (char)(((bech >> 6) & 0x0000003F) | (0x00000080));
                *((*ptr)++) =
                    (char)(((bech) & 0x0000003F) | (0x00000080));
                return;
        }

        if (bech < 0x04000000) {
                *((*ptr)++) =
                    (char)(((bech >> 24) & 0x00000003) | (0x000000F8));
                *((*ptr)++) =
                    (char)(((bech >> 18) & 0x0000003F) | (0x00000080));
                *((*ptr)++) =
                    (char)(((bech >> 12) & 0x0000003F) | (0x00000080));
                *((*ptr)++) =
                    (char)(((bech >> 6) & 0x0000003F) | (0x00000080));
                *((*ptr)++) =
                    (char)(((bech) & 0x0000003F) | (0x00000080));
                return;
        }

        *((*ptr)++) = (char)(((bech >> 30) & 0x00000001) | (0x000000FC));
        *((*ptr)++) = (char)(((bech >> 24) & 0x0000003F) | (0x00000080));
        *((*ptr)++) = (char)(((bech >> 18) & 0x0000003F) | (0x00000080));
        *((*ptr)++) = (char)(((bech >> 12) & 0x0000003F) | (0x00000080));
        *((*ptr)++) = (char)(((bech >> 6) & 0x0000003F) | (0x00000080));
        *((*ptr)++) = (char)(((bech) & 0x0000003F) | (0x00000080));
}


/* get the number of UTF-8 characters in a string */
int
utf8len(char *p, char *e)
{
        int n;

        n = 0;
        while (*p && p < e) {
                n++;

                if (!(*p & 0x80)) {
                        p++;
                        continue;
                } else if ((*p & 0xE0) == 0xC0) {
                        p+=2;
                        continue;
                } else if ((*p & 0xF0) == 0xE0) {
                        p+=3;
                        continue;
                } else if ((*p & 0xF8) == 0xF0) {
                        p+=4;
                        continue;
                } else if ((*p & 0xFC) == 0xF8) {
                        p+=5;
                        continue;
                } else if ((*p & 0xFE) == 0xFC) {
                        p+=6;
                        continue;
                } else
                        return (n);
        }

        return (n);
}


/* get the number of bytes the character takes when UTF-8 encoded */
int
utf8size(widechar_t ch)
{
        if (ch < 0x00000080)
                return(1);

        if (ch < 0x00000800)
                return(2);

        if (ch < 0x00010000)
                return(3);

        if (ch < 0x00200000)
                return(4);

        if (ch < 0x04000000)
                return(5);

        return (6);
}



/*
 * $Id: hash.c,v 1.2 2007/02/15 14:16:39 tin Exp $
 *
 * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <postgres.h>
#include <fmgr.h>
#include "lang.h"
#include "utf8.h"

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif


PG_FUNCTION_INFO_V1(HASH);


Datum
HASH(PG_FUNCTION_ARGS)
{
        static widechar_t input[sizeof(ABC_U)];
        static int value[sizeof(ABC_U)];
        static int count = -1;

        text *in, *out;
        char *pi, *po, *limit;
        widechar_t wc1, wc2;
        unsigned int i, size, val;

        if (count < 0) {
                count = 0;
                val = 0;
                pi = ABC_L;
                po = ABC_U;

                while ((wc1 = utf8get(&pi))) {
                        wc2 = utf8get(&po);
                        if (wc1 == (widechar_t)',') {
                                val++;
                                continue;
                        }
                        input[count] = wc1;
                        value[count] = val;
                        if(wc1 != wc2)
                        {
                                count++;
                                input[count] = wc2;
                                value[count] = val;
                        }
                        count++;
                }
        }
    
        in = PG_GETARG_TEXT_P(0);
        limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ;

        size = VARHDRSZ
            + (2 * utf8len((char *)VARDATA(in), limit));
        out = (text *)palloc(size);
        VARATT_SIZEP(out) = size;

        pi = (char *)VARDATA(in);
        po = (char *)VARDATA(out);

        while (pi < limit) {
                wc1 = utf8get(&pi);
                for (i=0; i<count; i++) {
                        if(input[i] == wc1)
                                break;
                }

                if (i < count) {
                        po[0] = 65 + (value[i] / 26);
                        po[1] = 65 + (value[i] % 26);
                } else {
                        po[0] = po[1] = 'Z';
                }

                po += 2;
        }

        PG_RETURN_TEXT_P(out);
}



/*
 * $Id: lower.c,v 1.2 2007/02/15 14:16:39 tin Exp $
 *
 * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <postgres.h>
#include <fmgr.h>
#include "lang.h"
#include "utf8.h"

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif


PG_FUNCTION_INFO_V1(LOWER);


Datum
LOWER(PG_FUNCTION_ARGS)
{
        static widechar_t input[sizeof(ABC_U)];
        static widechar_t output[sizeof(ABC_U)];
        static int diff[sizeof(ABC_U)];
        static int count = -1;

        text *in, *out;
        char *pi, *po, *limit;
        widechar_t wc1, wc2;
        unsigned int i, size;

        if (count < 0) {
                count = 0;
                pi = ABC_U;
                po = ABC_L;

                while ((wc1 = utf8get(&pi))) {
                        wc2 = utf8get(&po);
                        if (wc1 != wc2) {
                                input[count] = wc1;
                                output[count] = wc2;
                                diff[count] = utf8size(wc2) - utf8size(wc1);
                                count++;
                        }
                }
        }

        in = PG_GETARG_TEXT_P(0);
        limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ;

        size = VARSIZE(in);
        out = (text *)palloc(size);

        pi = (char *)VARDATA(in);
        po = (char *)VARDATA(out);

        while (pi < limit) {
                wc1 = utf8get(&pi);

                for (i=0; i<count; i++) {
                        if (input[i] == wc1)
                                break;
                }

                if (i < count) {
                        if (diff[i]) {
                                text *new_out;

                                new_out = (text *)palloc(size + diff[i]);
                                memcpy(VARDATA(new_out), VARDATA(out),
                                    size - VARHDRSZ);

                                po = VARDATA(new_out) + (po - VARDATA(out));

                                pfree(out);
                                out = new_out;
                        }
                        utf8put(&po, output[i]);

                } else
                        utf8put(&po, wc1);
        }

        VARATT_SIZEP(out) = size;

        PG_RETURN_TEXT_P(out);
}



/*
 * $Id: upper.c,v 1.2 2007/02/15 14:16:39 tin Exp $
 *
 * Copyright (c) 2004 Valentin Kozamernik <[EMAIL PROTECTED]>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <postgres.h>
#include <fmgr.h>
#include "lang.h"
#include "utf8.h"

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif


PG_FUNCTION_INFO_V1(UPPER);


Datum
UPPER(PG_FUNCTION_ARGS)
{
        static widechar_t input[sizeof(ABC_U)];
        static widechar_t output[sizeof(ABC_U)];
        static int diff[sizeof(ABC_U)];
        static int count = -1;

        text *in, *out;
        char *pi, *po, *limit;
        widechar_t wc1, wc2;
        unsigned int i, size;

        if (count < 0) {
                count = 0;
                pi = ABC_L;
                po = ABC_U;

                while ((wc1 = utf8get(&pi))) {
                        wc2 = utf8get(&po);
                        if (wc1 != wc2) {
                                input[count] = wc1;
                                output[count] = wc2;
                                diff[count] = utf8size(wc2) - utf8size(wc1);
                                count++;
                        }
                }
        }

        in = PG_GETARG_TEXT_P(0);
        limit = VARDATA(in) + VARSIZE(in) - VARHDRSZ;

        size = VARSIZE(in);
        out = (text *)palloc(size);

        pi = (char *)VARDATA(in);
        po = (char *)VARDATA(out);

        while (pi < limit) {
                wc1 = utf8get(&pi);

                for (i=0; i<count; i++) {
                        if (input[i] == wc1)
                                break;
                }

                if (i < count) {
                        if (diff[i]) {
                                text *new_out;

                                new_out = (text *)palloc(size + diff[i]);
                                memcpy(VARDATA(new_out), VARDATA(out),
                                    size - VARHDRSZ);

                                po = VARDATA(new_out) + (po - VARDATA(out));

                                pfree(out);
                                out = new_out;
                        }
                        utf8put(&po, output[i]);

                } else
                        utf8put(&po, wc1);
        }

        VARATT_SIZEP(out) = size;

        PG_RETURN_TEXT_P(out);
}


-- 
Tin

Reply via email to