On Fri, Oct 26, 2012 at 11:54 AM, Cedric Blancher
<[email protected]> wrote:
> On 22 October 2012 18:12, Roland Mainz <[email protected]> wrote:
>> On Mon, Oct 22, 2012 at 6:14 AM, Glenn Fowler <[email protected]> wrote:
>> [snip]
>>> ah but you may have been thinking getconf function and not getconf command
>>> in that case doing it with the getconf function is probably the way to go
>>
>> 1. Erm... I think you were right that locale(1) would be a "better"
>> place... but this would mean to create yet-another-builtin. Question
>> is... would you be OK with another one... this time to "intercept"
>> /usr/bin/locale and add new options to return valid values for
>> |wctype()| and |wctrans()| ?
>>
>> 2. Below is some prototype code to do the enumeration... does it
>> (generally) look OK for use in a locale(1) (or getconf(1)) ?
>> -- snip --
[snip]
>> -- snip --
>
> Roland, thanks for the test code.
>
> There are two more wctrans() classes you didn't list:
> 1. to_outpunct: This is a map from ASCII decimal point and
> thousands-sep to their equivalent in locale. This is defined for
> locales which use extra decimal point and thousands-sep.
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_outpunct x ; x="." ; print "|$x|"')
> |٫|
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_outpunct x ; x="," ; print "|$x|"')
> |٬|
>
> 2. to_inpunct: This is a map from ASCII digits to their equivalent in
> locale. This is defined for locales which use an extra digit set.
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_inpunct x ; x="." ; print "|$x|"')
> |٫|
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_inpunct x ; x="," ; print "|$x|"')
> |٬|
>
> Example application: Map ascii numbers to their Arabic counterparts:
> (LC_ALL=fa_IR ~/bin/ksh -c 'typeset -M to_inpunct m ; for ((i=1 ; i <
> 16384 ; i++ )) ; do p="$(printf "\u[$(printf "%x" i)]")" ; m="$p" ; [[
> "$p" == "$m" ]] || printf "%q != %q, %d\n" "$p" "$m" i ; done')
> , != $'\u[66c]', 44
> . != $'\u[66b]', 46
> 0 != ۰, 48
> 1 != ۱, 49
> 2 != ۲, 50
> 3 != ۳, 51
> 4 != ۴, 52
> 5 != ۵, 53
> 6 != ۶, 54
> 7 != ۷, 55
> 8 != ۸, 56
> 9 != ۹, 57
Thanks.... :-)
... and (as another native Japanese speaker pointed out) I forgot
the "jspace" character class... ;-(
Attached (as "wcsupportedlists002.c.txt") is an updated version of
the enumeration code which fixes the issues reported.
----
Bye,
Roland
--
__ . . __
(o.\ \/ /.o) [email protected]
\__\/\/__/ MPEG specialist, C&&JAVA&&Sun&&Unix programmer
/O /==\ O\ TEL +49 641 3992797
(;O/ \/ \O;)
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <wctype.h>
#include <locale.h>
/*
 * Candidate character class names which are probed with |wctype()|.
 * The entries are grouped (roughly) by the place where each name
 * was found; supported names are reported in this same order.
 */
static const char *character_classes[] = {
	/* classes mandated by POSIX */
	"alnum", "alpha", "blank", "cntrl",
	"digit", "graph", "lower", "print",
	"punct", "space", "upper", "xdigit",

	/*
	 * classes sampled from various locales on
	 * Solaris, FreeBSD and Apple OSX
	 */
	"english", "gb", "ideogram",
	"jalpha", "jdigit", "jspace", "jgen", "jgreek",
	"jhankana", "jhira",
	"jisx0201r", "jisx0208", "jisx0212",
	"jkanji", "jkata", "jparen", "jpunct", "jrussian",
	"jsci", "jspecial", "junit",
	"line", "number", "phonogram", "special",

	/* wchar0-24 are used by IBM's/OpenGroup's libc_i18n code */
	"wchar0", "wchar1", "wchar2", "wchar3", "wchar4",
	"wchar5", "wchar6", "wchar7", "wchar8", "wchar9",
	"wchar10", "wchar11", "wchar12", "wchar13", "wchar14",
	"wchar15", "wchar16", "wchar17", "wchar18", "wchar19",
	"wchar20", "wchar21", "wchar22", "wchar23", "wchar24",
};
/*
 * Candidate mapping names which are probed with |wctrans()|.
 * Supported names are reported in the order given here.
 */
static const char *wc_transformations[] = {
	"tolower", "toupper", "toascii",
	"tojhira", "tojisx0201", "tojisx0208", "tojkata",
	"totitle",
	"to_inpunct", "to_outpunct"
};
/* number of elements in |x|; |x| must be a real array, not a pointer */
#define elementsof(x) (sizeof(x) / sizeof(*(x)))
/*
 * Build a space-separated list of those names from |character_classes|
 * which |wctype()| accepts in the current locale.
 *
 * Returns a malloc()ed, NUL-terminated string; it is the empty string
 * if no name is supported. The caller owns the memory and should
 * free() it. Returns NULL (after printing a diagnostic via perror())
 * if the allocation fails.
 */
static
const char *get_list_of_supported_wctypes(void)
{
	size_t i;
	size_t len;
	size_t size = 0;
	bool matched[elementsof(character_classes)];
	const char *cl;
	char *s, *p;
	char buff[128];
	int n;

	/* pass 1: probe each name and add up the space needed */
	for (i=0 ; i < elementsof(character_classes) ; i++)
	{
		cl=character_classes[i];
		/*
		 * Some old Unixes like old Solaris have some classes
		 * _accidentally_ prefixed with "is" (this happens on
		 * other Unixes, too - because the matching data
		 * were written by the same contractors and/or
		 * cross-licensed between different companies).
		 * We work around the issue here by testing both
		 * the plain and the "is"-prefixed name; consumers of
		 * the values will have to do the same when calling
		 * |wctype()|.
		 */
		n=snprintf(buff, sizeof(buff), "is%s", cl);
		/* only probe the prefixed name if it was not truncated */
		matched[i]=(wctype(cl) ||
			(n > 0 && (size_t)n < sizeof(buff) && wctype(buff)));
		if (matched[i])
		{
			/* name plus one separator (or the terminator) */
			size+=strlen(cl)+1;
		}
	}

	/* |size| is 0 if nothing matched; +1 guarantees room for '\0' */
	s=p=malloc(size+1);
	if (!s)
	{
		perror("malloc() failed.");
		return (NULL);
	}
	*p='\0';

	/* pass 2: join the matched names with single blanks */
	for (i=0 ; i < elementsof(character_classes) ; i++)
	{
		if (!matched[i])
			continue;
		if (p != s)
			*p++=' ';
		len=strlen(character_classes[i]);
		memcpy(p, character_classes[i], len);
		p+=len;
	}
	*p='\0';
	return (s);
}
/*
 * Build a space-separated list of those names from
 * |wc_transformations| which |wctrans()| accepts in the current locale.
 *
 * Returns a malloc()ed, NUL-terminated string; it is the empty string
 * if no name is supported. The caller owns the memory and should
 * free() it. Returns NULL (after printing a diagnostic via perror())
 * if the allocation fails.
 */
static
const char *get_list_of_supported_wctransformations(void)
{
	size_t i;
	size_t len;
	size_t size = 0;
	bool matched[elementsof(wc_transformations)];
	const char *tr;
	char *s, *p;

	/* pass 1: probe each name and add up the space needed */
	for (i=0 ; i < elementsof(wc_transformations) ; i++)
	{
		tr=wc_transformations[i];
		matched[i]=(wctrans(tr) != (wctrans_t)0);
		if (matched[i])
		{
			/* name plus one separator (or the terminator) */
			size+=strlen(tr)+1;
		}
	}

	/* |size| is 0 if nothing matched; +1 guarantees room for '\0' */
	s=p=malloc(size+1);
	if (!s)
	{
		perror("malloc() failed.");
		return (NULL);
	}
	*p='\0';

	/* pass 2: join the matched names with single blanks */
	for (i=0 ; i < elementsof(wc_transformations) ; i++)
	{
		if (!matched[i])
			continue;
		if (p != s)
			*p++=' ';
		len=strlen(wc_transformations[i]);
		memcpy(p, wc_transformations[i], len);
		p+=len;
	}
	*p='\0';
	return (s);
}
/*
 * Print the |wctrans()| and |wctype()| names supported by the locale
 * selected through the environment.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if one of the lists could
 * not be built (the helper has already printed a diagnostic then).
 */
int main(int ac, char *av[])
{
	const char *list;

	/* no command line arguments are used */
	(void)ac;
	(void)av;

	/* on failure the locale is left unchanged; warn and carry on */
	if (!setlocale(LC_ALL, ""))
		fprintf(stderr, "warning: setlocale(LC_ALL, \"\") failed.\n");

	list=get_list_of_supported_wctransformations();
	if (!list)
		return (EXIT_FAILURE);
	printf("WCTRANS: Supported values for wctrans() are: |%s|\n", list);
	/* the helpers return malloc()ed memory behind a const pointer */
	free((void *)list);

	list=get_list_of_supported_wctypes();
	if (!list)
		return (EXIT_FAILURE);
	printf("WCTYPES: Supported values for wctype() are: |%s|\n", list);
	free((void *)list);

	return (EXIT_SUCCESS);
}
_______________________________________________
ast-users mailing list
[email protected]
https://mailman.research.att.com/mailman/listinfo/ast-users