Re: catopen(3) improvements

2013-06-02 Thread Stefan Sperling
On Sun, Jun 02, 2013 at 12:57:01PM -0400, vadi...@gmail.com wrote:
> On 5/31/13, Stefan Sperling  wrote:
> > +   ru-KOI8-R.msg ru.UTF-8.msg sv.ISO8859-1.msg sv.UTF-8.msg
> 
> Shouldn't it be ru.KOI8-R.msg ?

Yes, I spotted and fixed that before commit.



Re: catopen(3) improvements

2013-06-02 Thread vadimou
On 5/31/13, Stefan Sperling  wrote:
> + ru-KOI8-R.msg ru.UTF-8.msg sv.ISO8859-1.msg sv.UTF-8.msg

Shouldn't it be ru.KOI8-R.msg ?



Re: catopen(3) improvements

2013-05-31 Thread Matthew Dempsky
Nice, LGTM. :)

ok matthew



Re: catopen(3) improvements

2013-05-31 Thread Stefan Sperling
On Fri, May 31, 2013 at 06:01:14PM -0700, Matthew Dempsky wrote:
> In your man page diff, you should make use of .Dv and .Ev to identify
> the defines and environment variables.
> 
> Instead of setting FD_CLOEXEC with fcntl(), just use O_CLOEXEC when opening.
> 
> It looks like parse_lang() won't handle locales like "C.UTF-8" because
> there's no '_'.

Fixed.

> I think you should be able to implement the %l/%t/%c stuff without
> needing to call strdup() for each one (e.g., by just doing a one time
> pass over the locale string up front and noting where the '_' and '.'
> chars are).  Otherwise, you should report strdup() failures as ENOMEM
> to the catopen() caller, I think.

Nice, this makes the diff a lot smaller. Thanks!

Also using dots in filenames as suggested by bluhm.

Index: lib/libc/Makefile
===
RCS file: /cvs/src/lib/libc/Makefile,v
retrieving revision 1.31
diff -u -p -r1.31 Makefile
--- lib/libc/Makefile   15 Aug 2012 18:08:22 -  1.31
+++ lib/libc/Makefile   1 Jun 2013 01:22:57 -
@@ -17,7 +17,11 @@ LDADD=-nodefaultlibs -lgcc
 LIBCSRCDIR=${.CURDIR}
 .include "${LIBCSRCDIR}/Makefile.inc"
 
-NLS=   C.msg Pig.msg da.msg de.msg es.msg fi.msg fr.msg nl.msg no.msg ru.msg sv.msg it.msg
+NLS=   C.msg Pig.msg da.ISO8859-1.msg da.UTF-8.msg de.ISO8859-1.msg \
+   de.UTF-8.msg es.ISO8859-1.msg es.UTF-8.msg fi.ISO8859-1.msg \
+   fi.UTF-8.msg fr.ISO8859-1.msg fr.UTF-8.msg it.UTF-8.msg \
+   nl.ISO8859-1.msg nl.UTF-8.msg no.ISO8859-1.msg no.UTF-8.msg \
+   ru-KOI8-R.msg ru.UTF-8.msg sv.ISO8859-1.msg sv.UTF-8.msg
 
 copy-to-libkern:   copy-to-libkern-machind copy-to-libkern-machdep
 
Index: lib/libc/nls/catopen.3
===
RCS file: /cvs/src/lib/libc/nls/catopen.3,v
retrieving revision 1.6
diff -u -p -r1.6 catopen.3
--- lib/libc/nls/catopen.3  31 May 2007 19:19:30 -  1.6
+++ lib/libc/nls/catopen.3  1 Jun 2013 01:24:33 -
@@ -33,9 +33,18 @@ is used with
 substituted for
 .Ql \&%N .
 .Pp
-The
+If the
+.Fa oflag
+argument is
+.Dv NL_CAT_LOCALE ,
+the
+.Ev LC_MESSAGES
+environment variable is used to select the message catalog.
+If the
 .Fa oflag
-argument is reserved for future use and should be set to zero.
+argument is zero, the
+.Ev LANG
+environment variable is used to select the message catalog.
 .Sh RETURN VALUES
 Upon successful completion,
 .Fn catopen
@@ -43,11 +52,6 @@ returns a message catalog descriptor.
 Otherwise, \-1 is returned and
 .Va errno
 is set to indicate the error.
-.Sh ERRORS
-.Bl -tag -width Er
-.It Bq Er ENOMEM
-Insufficient memory available.
-.El
 .Sh SEE ALSO
 .Xr catclose 3 ,
 .Xr catgets 3
@@ -55,4 +59,4 @@ Insufficient memory available.
 The
 .Fn catopen
 function conforms to
-.St -xpg3 .
+.St -p1003.1-2008 .
Index: lib/libc/nls/catopen.c
===
RCS file: /cvs/src/lib/libc/nls/catopen.c,v
retrieving revision 1.14
diff -u -p -r1.14 catopen.c
--- lib/libc/nls/catopen.c  12 Jul 2011 21:31:20 -  1.14
+++ lib/libc/nls/catopen.c  1 Jun 2013 02:11:10 -
@@ -41,7 +41,7 @@
 #include 
 #include 
 
-#define NLS_DEFAULT_PATH "/usr/share/nls/%L/%N.cat:/usr/share/nls/%N/%L"
+#define NLS_DEFAULT_PATH "/usr/share/nls/%L/%N.cat:/usr/share/nls/%l.%c/%N.cat:/usr/share/nls/%l/%N.cat"
 #define NLS_DEFAULT_LANG "C"
 
 static nl_catd load_msgcat(const char *);
@@ -53,7 +53,7 @@ _catopen(const char *name, int oflag)
char tmppath[PATH_MAX];
char *nlspath;
char *lang;
-   char *s, *t;
+   char *s, *t, *sep, *dot;
const char *u;
nl_catd catd;

@@ -66,28 +66,66 @@ _catopen(const char *name, int oflag)
 
if (issetugid() != 0 || (nlspath = getenv("NLSPATH")) == NULL)
nlspath = NLS_DEFAULT_PATH;
-   if ((lang = getenv("LANG")) == NULL)
+
+   lang = NULL;
+   if (oflag & NL_CAT_LOCALE)
+   lang = getenv("LC_MESSAGES");
+   if (lang == NULL)
+   lang = getenv("LANG");
+   if (lang == NULL)
+   lang = NLS_DEFAULT_LANG;
+   if (strcmp(lang, "POSIX") == 0)
lang = NLS_DEFAULT_LANG;
 
s = nlspath;
-   t = tmppath;
+   t = tmppath;
+
+   /*
+* Locale names are of the form language[_territory][.codeset].
+* See POSIX-1-2008 "8.2 Internationalization Variables"
+*/
+   sep = strchr(lang, '_');
+   dot = strrchr(lang, '.');
do {
while (*s && *s != ':') {
if (*s == '%') {
switch (*(++s)) {
-   case 'L':   /* locale */
+   case 'L':   /* LANG or LC_MESSAGES */
u = lang;
while (*u && t < tmppath + PATH_MAX-1)
   

Re: catopen(3) improvements

2013-05-31 Thread Alexander Bluhm
On Sat, Jun 01, 2013 at 02:19:26AM +0200, Stefan Sperling wrote:
> -NLS= C.msg Pig.msg da.msg de.msg es.msg fi.msg fr.msg nl.msg no.msg ru.msg sv.msg it.msg
> +NLS= C.msg Pig.msg da-ISO8859-1.msg da-UTF-8.msg de-ISO8859-1.msg \
> + de-UTF-8.msg es-ISO8859-1.msg es-UTF-8.msg fi-ISO8859-1.msg \
> + fi-UTF-8.msg fr-ISO8859-1.msg fr-UTF-8.msg it-UTF-8.msg \
> + nl-ISO8859-1.msg nl-UTF-8.msg no-ISO8859-1.msg no-UTF-8.msg \
> + ru-KOI8-R.msg ru-UTF-8.msg sv-ISO8859-1.msg sv-UTF-8.msg

Why not use a . to separate language from codeset here for consistency?
Like de.UTF-8

bluhm



Re: catopen(3) improvements

2013-05-31 Thread Matthew Dempsky
On Fri, May 31, 2013 at 5:43 PM, Stefan Sperling  wrote:
> Yes. Conversion currently depends on the GNU iconv port.
> If iconv existed in base we could use just the UTF-8 source files.

Hmm, I see.  In that case, I guess committing converted files makes sense.

In your man page diff, you should make use of .Dv and .Ev to identify
the defines and environment variables.

Instead of setting FD_CLOEXEC with fcntl(), just use O_CLOEXEC when opening.

It looks like parse_lang() won't handle locales like "C.UTF-8" because
there's no '_'.

I think you should be able to implement the %l/%t/%c stuff without
needing to call strdup() for each one (e.g., by just doing a one time
pass over the locale string up front and noting where the '_' and '.'
chars are).  Otherwise, you should report strdup() failures as ENOMEM
to the catopen() caller, I think.



Re: catopen(3) improvements

2013-05-31 Thread Stefan Sperling
On Fri, May 31, 2013 at 05:34:34PM -0700, Matthew Dempsky wrote:
> On Fri, May 31, 2013 at 5:19 PM, Stefan Sperling  wrote:
> > Existing lib/libc/nls/*.msg files are renamed to the names shown in
> > the libc/Makefile part of the diff, and new ones are added to support
> > the UTF-8 locale (converted from the existing .msg files with iconv).
> > I'm not including this change in the diff since that would mix four
> > different character sets in a single email.
> 
> Is it possible for us to use UTF-8 for all of the source files, but
> convert them to KOI8 or ISO-8859-1 as appropriate at build/install
> time?  Or is that what you mean by we don't have iconv in base, so
> there's no functionality in base for converting between two charsets?

Yes. Conversion currently depends on the GNU iconv port.
If iconv existed in base we could use just the UTF-8 source files.



Re: catopen(3) improvements

2013-05-31 Thread Matthew Dempsky
On Fri, May 31, 2013 at 5:19 PM, Stefan Sperling  wrote:
> Existing lib/libc/nls/*.msg files are renamed to the names shown in
> the libc/Makefile part of the diff, and new ones are added to support
> the UTF-8 locale (converted from the existing .msg files with iconv).
> I'm not including this change in the diff since that would mix four
> different character sets in a single email.

Is it possible for us to use UTF-8 for all of the source files, but
convert them to KOI8 or ISO-8859-1 as appropriate at build/install
time?  Or is that what you mean by we don't have iconv in base, so
there's no functionality in base for converting between two charsets?



catopen(3) improvements

2013-05-31 Thread Stefan Sperling
The base system supports localization of error messages in strerror().
catopen(3) is used to read files that contain translated versions
of error messages.

Currently these files have arbitrary encodings. Most are latin1,
one is UTF-8, one is KOI8. These encodings are currently used
regardless of locale, so in some cases the strings look garbled.

The diff below adds support for locale-specific messages, so that
the proper encoding is used for output. Since we lack iconv() in
base we cannot convert messages at runtime. Pre-generated message
files are needed instead.

Existing lib/libc/nls/*.msg files are renamed to the names shown in
the libc/Makefile part of the diff, and new ones are added to support
the UTF-8 locale (converted from the existing .msg files with iconv).
I'm not including this change in the diff since that would mix four
different character sets in a single email.

While here, update catopen(3) to comply with POSIX-2008.

I see no point in listing errno values in the catopen(3) man page.
The current list is incomplete and all errors originate from
library functions that catopen(3) uses.

The complete diff which includes new message files passes make build.
There are distrib/sets changes which I'm not including below either.

Index: lib/libc/Makefile
===
RCS file: /cvs/src/lib/libc/Makefile,v
retrieving revision 1.31
diff -u -p -r1.31 Makefile
--- lib/libc/Makefile   15 Aug 2012 18:08:22 -  1.31
+++ lib/libc/Makefile   31 May 2013 19:30:12 -
@@ -17,7 +17,11 @@ LDADD=-nodefaultlibs -lgcc
 LIBCSRCDIR=${.CURDIR}
 .include "${LIBCSRCDIR}/Makefile.inc"
 
-NLS=   C.msg Pig.msg da.msg de.msg es.msg fi.msg fr.msg nl.msg no.msg ru.msg sv.msg it.msg
+NLS=   C.msg Pig.msg da-ISO8859-1.msg da-UTF-8.msg de-ISO8859-1.msg \
+   de-UTF-8.msg es-ISO8859-1.msg es-UTF-8.msg fi-ISO8859-1.msg \
+   fi-UTF-8.msg fr-ISO8859-1.msg fr-UTF-8.msg it-UTF-8.msg \
+   nl-ISO8859-1.msg nl-UTF-8.msg no-ISO8859-1.msg no-UTF-8.msg \
+   ru-KOI8-R.msg ru-UTF-8.msg sv-ISO8859-1.msg sv-UTF-8.msg
 
 copy-to-libkern:   copy-to-libkern-machind copy-to-libkern-machdep
 
Index: lib/libc/nls/catopen.3
===
RCS file: /cvs/src/lib/libc/nls/catopen.3,v
retrieving revision 1.6
diff -u -p -r1.6 catopen.3
--- lib/libc/nls/catopen.3  31 May 2007 19:19:30 -  1.6
+++ lib/libc/nls/catopen.3  31 May 2013 19:45:37 -
@@ -33,9 +33,14 @@ is used with
 substituted for
 .Ql \&%N .
 .Pp
-The
+If the
+.Fa oflag
+argument is NL_CAT_LOCALE, the LC_MESSAGES environment variable
+is used to select the message catalog.
+If the
 .Fa oflag
-argument is reserved for future use and should be set to zero.
+argument is zero, the LANG environment variable is used to select
+the message catalog.
 .Sh RETURN VALUES
 Upon successful completion,
 .Fn catopen
@@ -43,11 +48,6 @@ returns a message catalog descriptor.
 Otherwise, \-1 is returned and
 .Va errno
 is set to indicate the error.
-.Sh ERRORS
-.Bl -tag -width Er
-.It Bq Er ENOMEM
-Insufficient memory available.
-.El
 .Sh SEE ALSO
 .Xr catclose 3 ,
 .Xr catgets 3
@@ -55,4 +55,4 @@ Insufficient memory available.
 The
 .Fn catopen
 function conforms to
-.St -xpg3 .
+.St -p1003.1-2008 .
Index: lib/libc/nls/catopen.c
===
RCS file: /cvs/src/lib/libc/nls/catopen.c,v
retrieving revision 1.14
diff -u -p -r1.14 catopen.c
--- lib/libc/nls/catopen.c  12 Jul 2011 21:31:20 -  1.14
+++ lib/libc/nls/catopen.c  31 May 2013 19:41:34 -
@@ -41,11 +41,63 @@
 #include 
 #include 
 
-#define NLS_DEFAULT_PATH "/usr/share/nls/%L/%N.cat:/usr/share/nls/%N/%L"
+#define NLS_DEFAULT_PATH "/usr/share/nls/%L/%N.cat:/usr/share/nls/%l-%c/%N.cat:/usr/share/nls/%l/%N.cat:"
 #define NLS_DEFAULT_LANG "C"
 
 static nl_catd load_msgcat(const char *);
 
+/*
+ * Helpers for parsing components of locale names.
+ * Locale names are of the form language[_territory][.codeset].
+ * See POSIX-1-2008 "8.2 Internationalization Variables"
+ */
+
+static char *
+parse_lang(const char *lang)
+{
+   char *sep, *locname;
+
+   locname = strdup(lang);
+   if (locname == NULL)
+   return NULL;
+
+   sep = strchr(locname, '_');
+   if (sep)
+   *sep = '\0';
+
+   return locname;
+}
+
+static char *
+parse_territory(const char *lang)
+{
+   char *sep, *territory;
+
+   sep = strchr(lang, '_');
+   if (sep && strrchr(sep + 1, '.')) {
+   territory = strdup(sep + 1);
+   if (territory) {
+   sep = strrchr(territory, '.');
+   *sep = '\0';
+   return territory;
+   }
+   }
+
+   return NULL;
+}
+
+static char *
+parse_codeset(const char *lang)
+{
+   char *sep;
+
+   sep = strrchr(lang, '.');
+   if (sep)
+   return strdup(sep