Yeah, it's a workaround. Since there's no concept other than
alpha/numeric/latin in tsearch2, Asian characters have to fall into
one of them.


Ok, I see.

Please test the attached patch - if it is good, then I'll commit it on Monday to the HEAD and 8.2 branches.

PS. Magnus, may I ask you to test under Windows? Thank you.

--
Teodor Sigaev                                   E-mail: [EMAIL PROTECTED]
                                                   WWW: http://www.sigaev.ru/
diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
*** ../tsearch2.orig/ts_locale.c        Fri Jan 12 10:53:11 2007
--- ./ts_locale.c       Fri Jan 12 18:10:27 2007
***************
*** 12,24 ****
  size_t
  wchar2char(char *to, const wchar_t *from, size_t len)
  {
        if (GetDatabaseEncoding() == PG_UTF8)
        {
                int                     r;
  
-               if (len == 0)
-                       return 0;
- 
                r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
                                                                NULL, NULL);
  
--- 12,24 ----
  size_t
  wchar2char(char *to, const wchar_t *from, size_t len)
  {
+       if (len == 0)
+               return 0;
+ 
        if (GetDatabaseEncoding() == PG_UTF8)
        {
                int                     r;
  
                r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
                                                                NULL, NULL);
  
***************
*** 34,50 ****
  
        return wcstombs(to, from, len);
  }
  
  size_t
  char2wchar(wchar_t *to, const char *from, size_t len)
  {
        if (GetDatabaseEncoding() == PG_UTF8)
        {
                int                     r;
  
-               if (len == 0)
-                       return 0;
- 
                r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
  
                if (!r)
--- 34,52 ----
  
        return wcstombs(to, from, len);
  }
+ #endif   /* WIN32 */
  
  size_t
  char2wchar(wchar_t *to, const char *from, size_t len)
  {
+       if (len == 0)
+               return 0;
+ 
+ #ifdef WIN32
        if (GetDatabaseEncoding() == PG_UTF8)
        {
                int                     r;
  
                r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
  
                if (!r)
***************
*** 60,88 ****
  
                return r;
        }
  
        return mbstowcs(to, from, len);
  }
- #endif   /* WIN32 */
  
  int
  _t_isalpha(const char *ptr)
  {
!       wchar_t         character;
  
!       char2wchar(&character, ptr, 1);
  
!       return iswalpha((wint_t) character);
  }
  
  int
  _t_isprint(const char *ptr)
  {
!       wchar_t         character;
  
!       char2wchar(&character, ptr, 1);
  
!       return iswprint((wint_t) character);
  }
  #endif   /* TS_USE_WIDE */
  
--- 62,105 ----
  
                return r;
        }
+       else 
+ #endif /* WIN32 */
+       if ( lc_ctype_is_c() )
+       {
+               /*
+                * pg_mb2wchar_with_len always adds trailing '\0', so 
+                * 'to' should be allocated with sufficient space 
+                */
+               return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+       }
  
        return mbstowcs(to, from, len);
  }
  
  int
  _t_isalpha(const char *ptr)
  {
!       wchar_t         character[2];
! 
!       if (lc_ctype_is_c())
!               return isalpha(TOUCHAR(ptr));
  
!       char2wchar(character, ptr, 1);
  
!       return iswalpha((wint_t) *character);
  }
  
  int
  _t_isprint(const char *ptr)
  {
!       wchar_t         character[2];
! 
!       if (lc_ctype_is_c())
!               return isprint(TOUCHAR(ptr));
  
!       char2wchar(character, ptr, 1);
  
!       return iswprint((wint_t) *character);
  }
  #endif   /* TS_USE_WIDE */
  
***************
*** 126,132 ****
                if ( wlen < 0 )
                        ereport(ERROR,
                                        
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                                        errmsg("transalation failed from 
server encoding to wchar_t")));
  
                Assert(wlen<=len);
                wstr[wlen] = 0;
--- 143,149 ----
                if ( wlen < 0 )
                        ereport(ERROR,
                                        
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                                        errmsg("translation failed from server 
encoding to wchar_t")));
  
                Assert(wlen<=len);
                wstr[wlen] = 0;
***************
*** 152,158 ****
                if ( wlen < 0 )
                        ereport(ERROR,
                                        
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                                        errmsg("transalation failed from 
wchar_t to server encoding %d", errno)));
                Assert(wlen<=len);
                out[wlen]='\0';
        }
--- 169,175 ----
                if ( wlen < 0 )
                        ereport(ERROR,
                                        
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
!                                        errmsg("translation failed from 
wchar_t to server encoding %d", errno)));
                Assert(wlen<=len);
                out[wlen]='\0';
        }
diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
*** ../tsearch2.orig/ts_locale.h        Fri Jan 12 10:53:11 2007
--- ./ts_locale.h       Fri Jan 12 18:10:19 2007
***************
*** 30,45 ****
  #define TOUCHAR(x)    (*((unsigned char*)(x)))
  
  #ifdef TS_USE_WIDE
  
  #ifdef WIN32
  
  size_t                wchar2char(char *to, const wchar_t *from, size_t len);
! size_t                char2wchar(wchar_t *to, const char *from, size_t len);
  #else                                                 /* WIN32 */
  
! /* correct mbstowcs */
! #define char2wchar mbstowcs
  #define wchar2char wcstombs
  #endif   /* WIN32 */
  
  #define t_isdigit(x)  ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
--- 30,46 ----
  #define TOUCHAR(x)    (*((unsigned char*)(x)))
  
  #ifdef TS_USE_WIDE
+ size_t                char2wchar(wchar_t *to, const char *from, size_t len);
  
  #ifdef WIN32
  
  size_t                wchar2char(char *to, const wchar_t *from, size_t len);
! 
  #else                                                 /* WIN32 */
  
! /* correct wcstombs */
  #define wchar2char wcstombs
+ 
  #endif   /* WIN32 */
  
  #define t_isdigit(x)  ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
***************
*** 55,64 ****
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) 
) : false )
  
! #define COPYCHAR(d,s) do {                            \
!       int lll = pg_mblen( s );                        \
!                                                       \
!       while( lll-- )                                  \
                TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
  } while(0)
  
--- 56,65 ----
   */
  #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) 
) : false )
  
! #define COPYCHAR(d,s) do {                                    \
!       int lll = pg_mblen( s );                                        \
!                                                                               
                \
!       while( lll-- )                                                          
\
                TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
  } while(0)
  
diff -c -r -N ../tsearch2.orig/tsearch2.patch ./tsearch2.patch
*** ../tsearch2.orig/tsearch2.patch     Thu Jan  1 03:00:00 1970
--- ./tsearch2.patch    Fri Jan 12 18:12:30 2007
***************
*** 0 ****
--- 1,243 ----
+ diff -c -r -N ../tsearch2.orig/ts_locale.c ./ts_locale.c
+ *** ../tsearch2.orig/ts_locale.c      Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.c     Fri Jan 12 18:10:27 2007
+ ***************
+ *** 12,24 ****
+   size_t
+   wchar2char(char *to, const wchar_t *from, size_t len)
+   {
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+               int                     r;
+   
+ -             if (len == 0)
+ -                     return 0;
+ - 
+               r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+                                                               NULL, NULL);
+   
+ --- 12,24 ----
+   size_t
+   wchar2char(char *to, const wchar_t *from, size_t len)
+   {
+ +     if (len == 0)
+ +             return 0;
+ + 
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+               int                     r;
+   
+               r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+                                                               NULL, NULL);
+   
+ ***************
+ *** 34,50 ****
+   
+       return wcstombs(to, from, len);
+   }
+   
+   size_t
+   char2wchar(wchar_t *to, const char *from, size_t len)
+   {
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+               int                     r;
+   
+ -             if (len == 0)
+ -                     return 0;
+ - 
+               r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+   
+               if (!r)
+ --- 34,52 ----
+   
+       return wcstombs(to, from, len);
+   }
+ + #endif   /* WIN32 */
+   
+   size_t
+   char2wchar(wchar_t *to, const char *from, size_t len)
+   {
+ +     if (len == 0)
+ +             return 0;
+ + 
+ + #ifdef WIN32
+       if (GetDatabaseEncoding() == PG_UTF8)
+       {
+               int                     r;
+   
+               r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+   
+               if (!r)
+ ***************
+ *** 60,88 ****
+   
+               return r;
+       }
+   
+       return mbstowcs(to, from, len);
+   }
+ - #endif   /* WIN32 */
+   
+   int
+   _t_isalpha(const char *ptr)
+   {
+ !     wchar_t         character;
+   
+ !     char2wchar(&character, ptr, 1);
+   
+ !     return iswalpha((wint_t) character);
+   }
+   
+   int
+   _t_isprint(const char *ptr)
+   {
+ !     wchar_t         character;
+   
+ !     char2wchar(&character, ptr, 1);
+   
+ !     return iswprint((wint_t) character);
+   }
+   #endif   /* TS_USE_WIDE */
+   
+ --- 62,105 ----
+   
+               return r;
+       }
+ +     else 
+ + #endif /* WIN32 */
+ +     if ( lc_ctype_is_c() )
+ +     {
+ +             /*
+ +              * pg_mb2wchar_with_len always adds trailing '\0', so 
+ +              * 'to' should be allocated with sufficient space 
+ +              */
+ +             return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
+ +     }
+   
+       return mbstowcs(to, from, len);
+   }
+   
+   int
+   _t_isalpha(const char *ptr)
+   {
+ !     wchar_t         character[2];
+ ! 
+ !     if (lc_ctype_is_c())
+ !             return isalpha(TOUCHAR(ptr));
+   
+ !     char2wchar(character, ptr, 1);
+   
+ !     return iswalpha((wint_t) *character);
+   }
+   
+   int
+   _t_isprint(const char *ptr)
+   {
+ !     wchar_t         character[2];
+ ! 
+ !     if (lc_ctype_is_c())
+ !             return isprint(TOUCHAR(ptr));
+   
+ !     char2wchar(character, ptr, 1);
+   
+ !     return iswprint((wint_t) *character);
+   }
+   #endif   /* TS_USE_WIDE */
+   
+ ***************
+ *** 126,132 ****
+               if ( wlen < 0 )
+                       ereport(ERROR,
+                                       
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                                      errmsg("transalation failed from 
server encoding to wchar_t")));
+   
+               Assert(wlen<=len);
+               wstr[wlen] = 0;
+ --- 143,149 ----
+               if ( wlen < 0 )
+                       ereport(ERROR,
+                                       
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                                      errmsg("translation failed from server 
encoding to wchar_t")));
+   
+               Assert(wlen<=len);
+               wstr[wlen] = 0;
+ ***************
+ *** 152,158 ****
+               if ( wlen < 0 )
+                       ereport(ERROR,
+                                       
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                                      errmsg("transalation failed from 
wchar_t to server encoding %d", errno)));
+               Assert(wlen<=len);
+               out[wlen]='\0';
+       }
+ --- 169,175 ----
+               if ( wlen < 0 )
+                       ereport(ERROR,
+                                       
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ !                                      errmsg("translation failed from 
wchar_t to server encoding %d", errno)));
+               Assert(wlen<=len);
+               out[wlen]='\0';
+       }
+ diff -c -r -N ../tsearch2.orig/ts_locale.h ./ts_locale.h
+ *** ../tsearch2.orig/ts_locale.h      Fri Jan 12 10:53:11 2007
+ --- ./ts_locale.h     Fri Jan 12 18:10:19 2007
+ ***************
+ *** 30,45 ****
+   #define TOUCHAR(x)  (*((unsigned char*)(x)))
+   
+   #ifdef TS_USE_WIDE
+   
+   #ifdef WIN32
+   
+   size_t              wchar2char(char *to, const wchar_t *from, size_t len);
+ ! size_t              char2wchar(wchar_t *to, const char *from, size_t len);
+   #else                                                       /* WIN32 */
+   
+ ! /* correct mbstowcs */
+ ! #define char2wchar mbstowcs
+   #define wchar2char wcstombs
+   #endif   /* WIN32 */
+   
+   #define t_isdigit(x)        ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ --- 30,46 ----
+   #define TOUCHAR(x)  (*((unsigned char*)(x)))
+   
+   #ifdef TS_USE_WIDE
+ + size_t              char2wchar(wchar_t *to, const char *from, size_t len);
+   
+   #ifdef WIN32
+   
+   size_t              wchar2char(char *to, const wchar_t *from, size_t len);
+ ! 
+   #else                                                       /* WIN32 */
+   
+ ! /* correct wcstombs */
+   #define wchar2char wcstombs
+ + 
+   #endif   /* WIN32 */
+   
+   #define t_isdigit(x)        ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+ ***************
+ *** 55,64 ****
+    */
+   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned 
char)(c)) ) : false )
+   
+ ! #define COPYCHAR(d,s)       do {                            \
+ !     int lll = pg_mblen( s );                        \
+ !                                                     \
+ !     while( lll-- )                                  \
+               TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
+   } while(0)
+   
+ --- 56,65 ----
+    */
+   #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned 
char)(c)) ) : false )
+   
+ ! #define COPYCHAR(d,s)       do {                                    \
+ !     int lll = pg_mblen( s );                                        \
+ !                                                                             
                \
+ !     while( lll-- )                                                          
\
+               TOUCHAR((d)+lll) = TOUCHAR((s)+lll);    \
+   } while(0)
+   
diff -c -r -N ../tsearch2.orig/wordparser/parser.c ./wordparser/parser.c
*** ../tsearch2.orig/wordparser/parser.c        Fri Jan 12 10:53:11 2007
--- ./wordparser/parser.c       Fri Jan 12 18:10:38 2007
***************
*** 40,55 ****
  #ifdef TS_USE_WIDE
  
        /*
!        * Use wide char code only when max encoding length > 1 and ctype != C.
!        * Some operating systems fail with multi-byte encodings and a C locale.
!        * Also, for a C locale there is no need to process as multibyte. From
!        * backend/utils/adt/oracle_compat.c Teodor
         */
  
!       if (prs->charmaxlen > 1 && !lc_ctype_is_c())
        {
                prs->usewide = true;
!               prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr);
                prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
        }
        else
--- 40,52 ----
  #ifdef TS_USE_WIDE
  
        /*
!        * Use wide char code only when max encoding length > 1.
         */
  
!       if (prs->charmaxlen > 1)
        {
                prs->usewide = true;
!               prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * 
(prs->lenstr+1));
                prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
        }
        else
***************
*** 83,107 ****
  
  /*
   * defining support function, equvalent is* macroses, but
!  * working with any possible encodings and locales
   */
  
  #ifdef TS_USE_WIDE
  
! #define p_iswhat(type)                                                        
                        \
! static int                                                                    
                \
! p_is##type(TParser *prs) {                                                    
                \
!       Assert( prs->state );                                                   
                \
!       return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + 
prs->state->poschar ) ) : \
!               is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) 
);               \
! }     \
!                                                                               
                \
! static int                                                                    
                \
! p_isnot##type(TParser *prs) {                                                 
                \
!       return !p_is##type(prs);                                                
                \
  }
  
  
  
  /* p_iseq should be used only for ascii symbols */
  
--- 80,178 ----
  
  /*
   * defining support function, equvalent is* macroses, but
!  * working with any possible encodings and locales. Note,
!  * that with multibyte encoding and C-locale isw* function may fail
!  * or give wrong result. Note 2: multibyte encoding and C-locale 
!  * often are used for Asian languages.
   */
  
  #ifdef TS_USE_WIDE
  
! #define p_iswhat(type)                                                        
                                                        \
! static int                                                                    
                                                                \
! p_is##type(TParser *prs) {                                                    
                                                \
!       Assert( prs->state );                                                   
                                                \
!       if ( prs->usewide )                                                     
                                                        \
!       {                                                                       
                                                                        \
!               if ( lc_ctype_is_c() )                                          
                                                \
!                       return is##type( 0xff & *( prs->wstr + 
prs->state->poschar) );  \
!                                                                               
                                                                        \
!               return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) 
);      \
!       }                                                                       
                                                                        \
!                                                                               
                                                                        \
!       return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); 
\
! }                                                                             
                                                                        \
!                                                                               
                                                                        \
! static int                                                                    
                                                                \
! p_isnot##type(TParser *prs) {                                                 
                                        \
!       return !p_is##type(prs);                                                
                                                \
  }
  
+ static int 
+ p_isalnum(TParser *prs)
+ {
+       Assert( prs->state );
+ 
+       if (prs->usewide)
+       {
+               if (lc_ctype_is_c())
+               {
+                       unsigned int c = *(unsigned int*)(prs->wstr + 
prs->state->poschar);
+ 
+                       /*
+                        * any non-ascii symbol with multibyte encoding
+                        * with C-locale is an alpha character
+                        */
+                       if ( c > 0x7f )
+                               return 1;
+ 
+                       return isalnum(0xff & c);
+               }
+ 
+               return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));
+       }
+ 
+       return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
  
+ static int
+ p_isnotalnum(TParser *prs)
+ {
+       return !p_isalnum(prs);
+ }
+ 
+ static int 
+ p_isalpha(TParser *prs)
+ {
+       Assert( prs->state );
+ 
+       if (prs->usewide)
+       {
+               if (lc_ctype_is_c())
+               {
+                       unsigned int c = *(prs->wstr + prs->state->poschar);
+ 
+                       /*
+                        * any non-ascii symbol with multibyte encoding
+                        * with C-locale is an alpha character
+                        */
+                       if ( c > 0x7f )
+                               return 1;
+ 
+                       return isalpha(0xff & c);
+               }
+ 
+               return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));
+       }
+ 
+       return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));
+ }
+ 
+ static int
+ p_isnotalpha(TParser *prs)
+ {
+       return !p_isalpha(prs);
+ }
  
  /* p_iseq should be used only for ascii symbols */
  
***************
*** 111,128 ****
        Assert(prs->state);
        return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) 
== c)) ? 1 : 0;
  }
  #else                                                 /* TS_USE_WIDE */
  
! #define p_iswhat(type)                                                        
                        \
! static int                                                                    
                \
! p_is##type(TParser *prs) {                                                    
                \
!       Assert( prs->state );                                                   
                \
!       return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );  
                \
! }     \
!                                                                               
                \
! static int                                                                    
                \
! p_isnot##type(TParser *prs) {                                                 
                \
!       return !p_is##type(prs);                                                
                \
  }
  
  
--- 182,200 ----
        Assert(prs->state);
        return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) 
== c)) ? 1 : 0;
  }
+ 
  #else                                                 /* TS_USE_WIDE */
  
! #define p_iswhat(type)                                                        
                                                        \
! static int                                                                    
                                                                \
! p_is##type(TParser *prs) {                                                    
                                                \
!       Assert( prs->state );                                                   
                                                \
!       return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) );  
\
! }                                                                             
                                                                        \
!                                                                               
                                                                        \
! static int                                                                    
                                                                \
! p_isnot##type(TParser *prs) {                                                 
                                        \
!       return !p_is##type(prs);                                                
                                                \
  }
  
  
***************
*** 132,141 ****
        Assert(prs->state);
        return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
  }
- #endif   /* TS_USE_WIDE */
  
  p_iswhat(alnum)
  p_iswhat(alpha)
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
--- 204,215 ----
        Assert(prs->state);
        return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
  }
  
  p_iswhat(alnum)
  p_iswhat(alpha)
+ 
+ #endif   /* TS_USE_WIDE */
+ 
  p_iswhat(digit)
  p_iswhat(lower)
  p_iswhat(print)
---------------------------(end of broadcast)---------------------------
TIP 1: if posting/reading through Usenet, please send an appropriate
       subscribe-nomail command to [EMAIL PROTECTED] so that your
       message can get through to the mailing list cleanly

Reply via email to