Re: [ast-developers] rfe: .sh.regex.available_character_class array?

Roland Mainz Mon, 22 Oct 2012 09:12:16 -0700

On Mon, Oct 22, 2012 at 6:14 AM, Glenn Fowler <g...@research.att.com> wrote:
[snip]
> ah but you may have been thinking getconf function and not getconf command
> in that case doing it with the getconf function is probably the way to go


1. Erm... I think you were right that locale(1) would be a "better"
place, but this would mean creating yet another builtin. The question
is: would you be OK with another one, this time to "intercept"
/usr/bin/locale and add new options which return the valid values for
|wctype()| and |wctrans()|?

2. Below is some prototype code to do the enumeration... does it
(generally) look OK for use in a locale(1) (or getconf(1)) ?
-- snip --
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <wctype.h>
#include <locale.h>

/*
 * Candidate character class names which are probed with |wctype()|.
 * The table is read-only, therefore both the strings and the
 * pointers to them are |const|.
 */
const char *const character_classes[] =
{
        /* these are the classes mandated by POSIX */
        "alnum",
        "alpha",
        "blank",
        "cntrl",
        "digit",
        "graph",
        "lower",
        "print",
        "punct",
        "space",
        "upper",
        "xdigit",
        /*
         * these are the classes sampled from various locales on
         * Solaris, FreeBSD and Apple OSX
         */
        "english",
        "gb",
        "ideogram",
        "jalpha",
        "jdigit",
        "jgen",
        "jgreek",
        "jhankana",
        "jhira",
        "jisx0201r",
        "jisx0208",
        "jisx0212",
        "jkanji",
        "jkata",
        "jparen",
        "jpunct",
        "jrussian",
        "jsci",
        "jspecial",
        "junit",
        "line",
        "number",
        "phonogram",
        "special",
        "wchar0",
        "wchar1",
        "wchar2",
        "wchar3",
        "wchar4",
        "wchar5",
        "wchar6",
        "wchar7",
        "wchar8",
        "wchar9",
        "wchar10",
        "wchar11",
        "wchar12",
        "wchar13",
        "wchar14",
        "wchar15",
        "wchar16",
        "wchar17",
        "wchar18",
        "wchar19",
        "wchar20",
        "wchar21",
        "wchar22",
        "wchar23",
        "wchar24",
};

/*
 * Candidate transformation names which are probed with |wctrans()|.
 * The table is read-only, therefore both the strings and the
 * pointers to them are |const|.
 */
const char *const wc_transformations[]={
        "tolower",
        "toupper",
        "toascii",
        "tojhira",
        "tojisx0201",
        "tojisx0208",
        "tojkata",
        "totitle",
};

/*
 * Number of elements in the array |x|. Only valid for real arrays,
 * not for pointers (including array-typed function parameters).
 */
#define elementsof(x)    (sizeof(x)/sizeof((x)[0]))

/*
 * Build a list of all names in |character_classes| which |wctype()|
 * accepts in the current locale.
 *
 * Returns a |malloc()|ed, NUL-terminated string with the matching
 * names separated by single blanks (an empty string if nothing
 * matched), or |NULL| if the allocation failed (a diagnostic is
 * printed via |perror()| in that case). The caller owns the
 * returned memory and must |free()| it.
 */
static
const char *get_list_of_supported_wctypes(void)
{
        size_t          i;
        bool            matched[elementsof(character_classes)];
        size_t          size = 0;
        size_t          len;
        int             n;
        const char      *cl;
        char            *s, *p;
        char            buff[128];

        for (i=0 ; i < elementsof(character_classes) ; i++)
        {
                cl=character_classes[i];

                /*
                 * Some old Unixes like old Solaris have some classes
                 * _accidentally_ prefixed with "is" (this happens on
                 * other Unixes, too - because the matching data
                 * have been both written by the same contractors
                 * and/or cross-licensed between different
                 * companies).
                 * We work around the issue here by testing both
                 * the plain and the "is"-prefixed name.
                 * |snprintf()| keeps the copy within |buff|; a
                 * truncated name is never handed to |wctype()|.
                 */
                n=snprintf(buff, sizeof(buff), "is%s", cl);

                matched[i]=(wctype(cl) ||
                        (n > 0 && (size_t)n < sizeof(buff) && wctype(buff)));

                /* name plus one byte for the separator */
                if (matched[i])
                        size+=strlen(cl)+1;
        }

        /* +1 for the terminating NUL (also covers the "no match" case) */
        s=p=malloc(size+1);
        if (!s)
        {
                perror("malloc() failed.");
                return (NULL);
        }

        for (i=0 ; i < elementsof(character_classes) ; i++)
        {
                if (!matched[i])
                        continue;

                /*
                 * Put the separator in front of every entry except
                 * the first one, so there is no trailing blank to
                 * strip and |p| never moves in front of |s|.
                 */
                if (p != s)
                        *p++=' ';

                len=strlen(character_classes[i]);
                memcpy(p, character_classes[i], len);
                p+=len;
        }

        /* always terminate, even if no class matched */
        *p='\0';

        return (s);
}

/*
 * Build a list of all names in |wc_transformations| which
 * |wctrans()| accepts in the current locale.
 *
 * Returns a |malloc()|ed, NUL-terminated string with the matching
 * names separated by single blanks (an empty string if nothing
 * matched), or |NULL| if the allocation failed (a diagnostic is
 * printed via |perror()| in that case). The caller owns the
 * returned memory and must |free()| it.
 */
static
const char *get_list_of_supported_wctransformations(void)
{
        size_t          i;
        bool            matched[elementsof(wc_transformations)];
        size_t          size = 0;
        size_t          len;
        const char      *tr;
        char            *s, *p;

        for (i=0 ; i < elementsof(wc_transformations) ; i++)
        {
                tr=wc_transformations[i];

                if (wctrans(tr))
                {
                        /* name plus one byte for the separator */
                        size+=strlen(tr)+1;
                        matched[i]=true;
                }
                else
                {
                        matched[i]=false;
                }
        }

        /* +1 for the terminating NUL (also covers the "no match" case) */
        s=p=malloc(size+1);
        if (!s)
        {
                perror("malloc() failed.");
                return (NULL);
        }

        for (i=0 ; i < elementsof(wc_transformations) ; i++)
        {
                if (!matched[i])
                        continue;

                /*
                 * Put the separator in front of every entry except
                 * the first one, so there is no trailing blank to
                 * strip and |p| never moves in front of |s|.
                 */
                if (p != s)
                        *p++=' ';

                len=strlen(wc_transformations[i]);
                memcpy(p, wc_transformations[i], len);
                p+=len;
        }

        /* always terminate, even if no transformation matched */
        *p='\0';

        return (s);
}

/*
 * Select the locale from the environment and print the lists of
 * names accepted by |wctype()| and |wctrans()| in that locale.
 *
 * Returns |EXIT_SUCCESS| if both lists could be built, otherwise
 * |EXIT_FAILURE|.
 */
int main(int ac, char *av[])
{
        const char      *wctypes;
        const char      *wctransformations;
        int             retval = EXIT_SUCCESS;

        /* command line arguments are not used */
        (void)ac;
        (void)av;

        /* on failure the locale is left unchanged; report and carry on */
        if (!setlocale(LC_ALL, ""))
                fprintf(stderr, "warning: setlocale() failed.\n");

        wctypes=get_list_of_supported_wctypes();
        wctransformations=get_list_of_supported_wctransformations();

        /* never pass |NULL| to "%s" - that is undefined behaviour */
        if (wctypes)
        {
                printf("Supported values for wctype() are: |%s|\n",
                        wctypes);
        }
        else
        {
                retval=EXIT_FAILURE;
        }

        if (wctransformations)
        {
                printf("Supported values for wctrans() are: |%s|\n",
                        wctransformations);
        }
        else
        {
                retval=EXIT_FAILURE;
        }

        /* the lists are heap-allocated and owned by the caller */
        free((void *)wctypes);
        free((void *)wctransformations);

        return (retval);
}
-- snip --

----

Bye,
Roland

-- 
  __ .  . __
 (o.\ \/ /.o) roland.ma...@nrubsig.org
  \__\/\/__/  MPEG specialist, C&&JAVA&&Sun&&Unix programmer
  /O /==\ O\  TEL +49 641 3992797
 (;O/ \/ \O;)
_______________________________________________
ast-developers mailing list
ast-developers@research.att.com
https://mailman.research.att.com/mailman/listinfo/ast-developers

Re: [ast-developers] rfe: .sh.regex.available_character_class array?

Reply via email to