Re: [ast-developers] rfe: .sh.regex.available_character_class array?

Roland Mainz Fri, 26 Oct 2012 14:42:30 -0700

On Fri, Oct 26, 2012 at 11:54 AM, Cedric Blancher
<cedric.blanc...@googlemail.com> wrote:
> On 22 October 2012 18:12, Roland Mainz <roland.ma...@nrubsig.org> wrote:
>> On Mon, Oct 22, 2012 at 6:14 AM, Glenn Fowler <g...@research.att.com> wrote:
>> [snip]
>>> ah but you may have been thinking getconf function and not getconf command
>>> in that case doing it with the getconf function is probably the way to go
>>
>> 1. Erm... I think you were right that locale(1) would be a "better"
>> place... but this would mean creating yet another builtin. The question
>> is... would you be OK with another one... this time to "intercept"
>> /usr/bin/locale and add new options which return the valid values for
>> |wctype()| and |wctrans()|?
>>
>> 2. Below is some prototype code to do the enumeration... does it
>> (generally) look OK for use in a locale(1) (or getconf(1)) ?
>> -- snip --
[snip]
>> -- snip --
>
> Roland, thanks for the test code.
>
> There are two more wctrans() classes you didn't list:
> 1. to_outpunct: This is a map from the ASCII decimal point and
> thousands separator to their equivalents in the locale. This is defined
> for locales which use an extra decimal point and thousands separator.
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_outpunct x ; x="." ; print "|$x|"')
> |٫|
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_outpunct x ; x="," ; print "|$x|"')
> |٬|
>
> 2. to_inpunct: This is a map from ASCII digits to their equivalents in
> the locale. This is defined for locales which use an extra digit set.
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_inpunct x ; x="." ; print "|$x|"')
> |٫|
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_inpunct x ; x="," ; print "|$x|"')
> |٬|
>
> Example application: Map ascii numbers to their Arabic counterparts:
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_inpunct m ; for ((i=1 ; i <
> 16384 ; i++ )) ; do p="$(printf "\u[$(printf "%x" i)]")" ; m="$p" ; [[
> "$p" == "$m" ]] || printf "%q != %q, %d\n" "$p" "$m" i ; done')
> , != $'\u[66c]', 44
> . != $'\u[66b]', 46
> 0 != ۰, 48
> 1 != ۱, 49
> 2 != ۲, 50
> 3 != ۳, 51
> 4 != ۴, 52
> 5 != ۵, 53
> 6 != ۶, 54
> 7 != ۷, 55
> 8 != ۸, 56
> 9 != ۹, 57


Thanks.... :-)
... and (as another native Japanese speaker pointed out) I forgot
the "jspace" character class... ;-(

... attached (as "wcsupportedlists002.c.txt") is an updated version of
the enumeration code which fixes the issues reported.

----

Bye,
Roland

-- 
  __ .  . __
 (o.\ \/ /.o) roland.ma...@nrubsig.org
  \__\/\/__/  MPEG specialist, C&&JAVA&&Sun&&Unix programmer
  /O /==\ O\  TEL +49 641 3992797
 (;O/ \/ \O;)

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <wctype.h>
#include <locale.h>

/*
 * Candidate character class names which are probed via |wctype()|.
 * The entries are grouped by the place where each name was found
 * (not alphabetically); the order here is the order of the output.
 */
static const char *character_classes[] = {
        /* classes mandated by POSIX */
        "alnum", "alpha", "blank", "cntrl",
        "digit", "graph", "lower", "print",
        "punct", "space", "upper", "xdigit",

        /*
         * classes sampled from various locales on Solaris,
         * FreeBSD and Apple OSX
         */
        "english", "gb", "ideogram",
        "jalpha", "jdigit", "jspace", "jgen", "jgreek",
        "jhankana", "jhira",
        "jisx0201r", "jisx0208", "jisx0212",
        "jkanji", "jkata", "jparen", "jpunct", "jrussian",
        "jsci", "jspecial", "junit",
        "line", "number", "phonogram", "special",

        /* wchar0-24 are used by IBM's/OpenGroup's libc_i18n code */
        "wchar0",  "wchar1",  "wchar2",  "wchar3",  "wchar4",
        "wchar5",  "wchar6",  "wchar7",  "wchar8",  "wchar9",
        "wchar10", "wchar11", "wchar12", "wchar13", "wchar14",
        "wchar15", "wchar16", "wchar17", "wchar18", "wchar19",
        "wchar20", "wchar21", "wchar22", "wchar23", "wchar24",
};

/*
 * Candidate transformation names which are probed via |wctrans()|.
 * The order here is the order of the output.
 */
static const char *wc_transformations[] = {
        "tolower", "toupper", "toascii",
        "tojhira", "tojisx0201", "tojisx0208", "tojkata",
        "totitle",
        "to_inpunct", "to_outpunct"
};

/*
 * Number of elements in the array |x|. |x| must be a real array;
 * for a pointer this would divide the pointer size by the element size.
 */
#define elementsof(x)    (sizeof(x)/sizeof((x)[0]))

/*
 * Build a space-separated list of those names from
 * |character_classes| which |wctype()| accepts in the current
 * locale (either under the plain name or with an "is" prefix).
 *
 * Returns a string allocated with |malloc()| - the empty string
 * if no class matched - or NULL if the allocation failed (a
 * message is then printed via |perror()|). The caller owns the
 * returned string and releases it with |free()|.
 */
static
const char *get_list_of_supported_wctypes(void)
{
        size_t          i;
        bool            matched[elementsof(character_classes)];
        size_t          size = 0;
        size_t          len;
        int             n;
        bool            have_is_name;
        const char      *cl;
        char            *s, *p;
        char            buff[128];

        for (i=0 ; i < elementsof(character_classes) ; i++)
        {
                cl=character_classes[i];

                /*
                 * Some old Unixes like old Solaris have some classes
                 * _accidentally_ prefixed with "is" (this happens on
                 * other Unixes, too - because the matching data
                 * have been both written by the same contractors
                 * and/or cross-licensed between different
                 * companies).
                 * We work-around the issue here by testing both
                 * the plain and intended name; consumers of the
                 * values will have to do the same when calling
                 * |wctype()|.
                 *
                 * The prefixed name is only probed if it fits into
                 * |buff| without truncation.
                 */
                n=snprintf(buff, sizeof(buff), "is%s", cl);
                have_is_name=(n > 0 && (size_t)n < sizeof(buff));

                if (wctype(cl) || (have_is_name && wctype(buff)))
                {
                        /* name plus one byte for separator/terminator */
                        size+=strlen(cl)+1;
                        matched[i]=true;
                }
                else
                {
                        matched[i]=false;
                }
        }

        /* |size+1| keeps the request non-zero if nothing matched */
        s=p=malloc(size+1);
        if (!s)
        {
                perror("malloc() failed.");
                return (NULL);
        }

        /* the result is a valid (empty) string even without matches */
        *p='\0';

        for (i=0 ; i < elementsof(character_classes) ; i++)
        {
                if (!matched[i])
                        continue;

                /* separator goes _before_ every name except the first */
                if (p != s)
                        *p++=' ';

                len=strlen(character_classes[i]);
                memcpy(p, character_classes[i], len);
                p+=len;
                *p='\0';
        }

        return (s);
}

/*
 * Build a space-separated list of those names from
 * |wc_transformations| which |wctrans()| accepts in the current
 * locale.
 *
 * Returns a string allocated with |malloc()| - the empty string
 * if no transformation matched - or NULL if the allocation failed
 * (a message is then printed via |perror()|). The caller owns the
 * returned string and releases it with |free()|.
 */
static
const char *get_list_of_supported_wctransformations(void)
{
        size_t          i;
        bool            matched[elementsof(wc_transformations)];
        size_t          size = 0;
        size_t          len;
        const char      *tr;
        char            *s, *p;

        for (i=0 ; i < elementsof(wc_transformations) ; i++)
        {
                tr=wc_transformations[i];

                if (wctrans(tr))
                {
                        /* name plus one byte for separator/terminator */
                        size+=strlen(tr)+1;
                        matched[i]=true;
                }
                else
                {
                        matched[i]=false;
                }
        }

        /* |size+1| keeps the request non-zero if nothing matched */
        s=p=malloc(size+1);
        if (!s)
        {
                perror("malloc() failed.");
                return (NULL);
        }

        /* the result is a valid (empty) string even without matches */
        *p='\0';

        for (i=0 ; i < elementsof(wc_transformations) ; i++)
        {
                if (!matched[i])
                        continue;

                /* separator goes _before_ every name except the first */
                if (p != s)
                        *p++=' ';

                len=strlen(wc_transformations[i]);
                memcpy(p, wc_transformations[i], len);
                p+=len;
                *p='\0';
        }

        return (s);
}

/*
 * Print the |wctrans()| and |wctype()| names supported by the
 * locale selected through the environment.
 *
 * Exits with EXIT_FAILURE if one of the lists could not be built
 * (the list functions return NULL in that case), otherwise with
 * EXIT_SUCCESS.
 */
int main(int ac, char *av[])
{
        const char      *wctrans_list;
        const char      *wctype_list;
        int             status = EXIT_SUCCESS;

        /* no command line arguments are used */
        (void)ac;
        (void)av;

        setlocale(LC_ALL, "");

        wctrans_list=get_list_of_supported_wctransformations();
        if (wctrans_list)
        {
                printf("WCTRANS: Supported values for wctrans() are: |%s|\n",
                        wctrans_list);
        }
        else
        {
                /* passing NULL to "%s" would be undefined behaviour */
                status=EXIT_FAILURE;
        }

        wctype_list=get_list_of_supported_wctypes();
        if (wctype_list)
        {
                printf("WCTYPES: Supported values for wctype() are: |%s|\n",
                        wctype_list);
        }
        else
        {
                status=EXIT_FAILURE;
        }

        /* both lists are |malloc()|ed; |free(NULL)| is a no-op */
        free((void *)wctrans_list);
        free((void *)wctype_list);

        return (status);
}

_______________________________________________
ast-developers mailing list
ast-developers@research.att.com
https://mailman.research.att.com/mailman/listinfo/ast-developers

Re: [ast-developers] rfe: .sh.regex.available_character_class array?

Reply via email to