Re: Update UTF-8 locale ctype data (was: Re: ls(1) multibyte support)

2011-03-04 Thread Stefan Sperling
On Sat, Jan 15, 2011 at 12:44:51AM +0100, Stefan Sperling wrote:
 On Fri, Jan 14, 2011 at 05:21:46PM +0100, Stefan Sperling wrote:
  On Thu, Jan 06, 2011 at 07:52:19PM +0300, Alexander Polakov wrote:
   * Alexander Polakov polac...@gmail.com [110105 17:20]:
Hi,

here's an updated version.

1) en_US.UTF-8.src updates from FreeBSD
  
  Let's start with those.
  
  These changes are all fine, I checked them against Unicode 5.2.
  http://www.unicode.org/Public/5.2.0/charts/CodeCharts-noHan.pdf
  
  The diff below (from Alexander) brings us up to par with FreeBSD.
  Many updates could be made to this file to support additional
  characters listed in Unicode 5.2.0 (or even 6.0.0).
  But that can be done later.
  
  Can someone ok this? Thanks in advance.
 
 Before the ctype changes can go in, we'll need to commit this part from
 Alexander's diff to fix mklocale (caught by nicm@, thanks!)

Can this go in now?
Any OKs?

Index: lib/libc/locale/runetype.h
===
RCS file: /cvs/src/lib/libc/locale/runetype.h,v
retrieving revision 1.5
diff -u -p -r1.5 runetype.h
--- lib/libc/locale/runetype.h  8 Oct 2007 08:17:15 -   1.5
+++ lib/libc/locale/runetype.h  14 Jan 2011 23:34:28 -
@@ -69,9 +69,9 @@ typedef uint32_t _RuneType;
 #define_RUNETYPE_I 0x0008U /* Ideogram */
 #define_RUNETYPE_T 0x0010U /* Special */
 #define_RUNETYPE_Q 0x0020U /* Phonogram */
-#define_RUNETYPE_SWM   0xc000U/* Mask to get screen width data */
+#define_RUNETYPE_SWM   0xe000U /* Mask to get screen width data */
 #define_RUNETYPE_SWS   30  /* Bits to shift to get width */
-#define_RUNETYPE_SW0   0xU /* 0 width character */
+#define_RUNETYPE_SW0   0x2000U /* 0 width character */
 #define_RUNETYPE_SW1   0x4000U /* 1 width character */
 #define_RUNETYPE_SW2   0x8000U /* 2 width character */
 #define_RUNETYPE_SW3   0xc000U /* 3 width character */
Index: share/locale/ctype/en_US.UTF-8.src
===
RCS file: /cvs/src/share/locale/ctype/en_US.UTF-8.src,v
retrieving revision 1.1
diff -u -p -r1.1 en_US.UTF-8.src
--- share/locale/ctype/en_US.UTF-8.src  7 Aug 2005 10:03:45 -   1.1
+++ share/locale/ctype/en_US.UTF-8.src  15 Jan 2011 15:49:26 -
@@ -491,9 +491,9 @@ SWIDTH1   0x02b0 - 0x02ee
  * U+0300 - U+036F : Combining Diacritical Marks
  */
 
-GRAPH 0x0300 - 0x034f  0x0360 - 0x036f
-PRINT 0x0300 - 0x034f  0x0360 - 0x036f
-SWIDTH1   0x0300 - 0x034f  0x0360 - 0x036f
+GRAPH 0x0300 - 0x034e  0x0350 - 0x036f
+PRINT 0x0300 - 0x034e  0x0350 - 0x036f
+SWIDTH0   0x0300 - 0x034e  0x0350 - 0x036f
 
 MAPUPPER   0x0345 0x0399 
 
@@ -583,7 +583,7 @@ LOWER 0x04b9  0x04bb  0x04bd  0x04bf
 LOWER 0x04c8  0x04ca  0x04cc  0x04ce  0x04d1  0x04d3  0x04d5
 LOWER 0x04d7  0x04d9  0x04db  0x04dd  0x04df  0x04e1  0x04e3
 LOWER 0x04e5  0x04e7  0x04e9  0x04eb  0x04ed  0x04ef  0x04f1
-LOWER 0x04f3  0x04f5  0x04f9
+LOWER 0x04f3  0x04f5  0x04f7  0x04f9
 PUNCT 0x0482
 UPPER 0x0400 - 0x042f  0x0460  0x0462  0x0464  0x0466  0x0468
 UPPER 0x046a  0x046c  0x046e  0x0470  0x0472  0x0474  0x0476
@@ -595,9 +595,10 @@ UPPER 0x04b8  0x04ba  0x04bc  0x04be
 UPPER 0x04c5  0x04c7  0x04c9  0x04cb  0x04cd  0x04d0  0x04d2
 UPPER 0x04d4  0x04d6  0x04d8  0x04da  0x04dc  0x04de  0x04e0
 UPPER 0x04e2  0x04e4  0x04e6  0x04e8  0x04ea  0x04ec  0x04ee
-UPPER 0x04f0  0x04f2  0x04f4  0x04f8
-PRINT 0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f5  0x04f8  0x04f9
-SWIDTH1   0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f5  0x04f8  0x04f9
+UPPER 0x04f0  0x04f2  0x04f4  0x04f6  0x04f8
+PRINT 0x0400 - 0x0486  0x0488 - 0x04ce  0x04d0 - 0x04f9
+SWIDTH0   0x0483 - 0x0486  0x0488 - 0x0489
+SWIDTH1   0x0400 - 0x0482  0x048a - 0x04ce  0x04d0 - 0x04f9
 
 MAPUPPER   0x0430 - 0x044f : 0x0410 
 MAPUPPER   0x0450 - 0x045f : 0x0400 
@@ -671,6 +672,7 @@ MAPUPPER   0x04ef 0x04ee 
 MAPUPPER   0x04f1 0x04f0 
 MAPUPPER   0x04f3 0x04f2 
 MAPUPPER   0x04f5 0x04f4 
+MAPUPPER   0x04f7 0x04f6 
 MAPUPPER   0x04f9 0x04f8 
 MAPLOWER   0x0400 - 0x040f : 0x0450 
 MAPLOWER   0x0410 - 0x042f : 0x0430 
@@ -744,6 +746,7 @@ MAPLOWER   0x04ee 0x04ef 
 MAPLOWER   0x04f0 0x04f1 
 MAPLOWER   0x04f2 0x04f3 
 MAPLOWER   0x04f4 0x04f5 
+MAPLOWER   0x04f6 0x04f7 
 MAPLOWER   0x04f8 0x04f9 
 
 
@@ -1052,7 +1055,8 @@ DIGIT 0x0e50 - 0x0e59
 GRAPH 0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
 PUNCT 0x0e3f  0x0e4f  0x0e5a  0x0e5b
 PRINT 0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
-SWIDTH1   0x0e01 - 0x0e3a  0x0e3f - 0x0e5b
+SWIDTH0   0x0e31   0x0e34 - 0x0e3a  0x0e47 - 0x0e4e
+SWIDTH1   0x0e01 - 0x0e30  0x0e32 - 0x0e33  0x0e3f - 0x0e46  0x0e4f - 0x0e5b
 
 TODIGIT0x0e50 - 0x0e59 : 0x 
 
@@ -1283,6 +1287,14 @@ SWIDTH1   0x1800 - 0x180d  

Re: Update UTF-8 locale ctype data (was: Re: ls(1) multibyte support)

2011-01-14 Thread Stefan Sperling
On Fri, Jan 14, 2011 at 05:21:46PM +0100, Stefan Sperling wrote:
 On Thu, Jan 06, 2011 at 07:52:19PM +0300, Alexander Polakov wrote:
  * Alexander Polakov polac...@gmail.com [110105 17:20]:
   Hi,
   
   here's an updated version.
   
   1) en_US.UTF-8.src updates from FreeBSD
 
 Let's start with those.
 
 These changes are all fine, I checked them against Unicode 5.2.
 http://www.unicode.org/Public/5.2.0/charts/CodeCharts-noHan.pdf
 
 The diff below (from Alexander) brings us up to par with FreeBSD.
 Many updates could be made to this file to support additional
 characters listed in Unicode 5.2.0 (or even 6.0.0).
 But that can be done later.
 
 Can someone ok this? Thanks in advance.

Before the ctype changes can go in, we'll need to commit this part from
Alexander's diff to fix mklocale (caught by nicm@, thanks!)

These symbols are internal to libc, with the exception of mklocale.
Can this go in during ABI lock?

Index: lib/libc/locale/runetype.h
===
RCS file: /OpenBSD/src/lib/libc/locale/runetype.h,v
retrieving revision 1.5
diff -u -r1.5 runetype.h
--- lib/libc/locale/runetype.h  8 Oct 2007 08:17:15 -   1.5
+++ lib/libc/locale/runetype.h  6 Jan 2011 16:24:20 -
@@ -69,9 +69,9 @@
 #define_RUNETYPE_I 0x0008U /* Ideogram */
 #define_RUNETYPE_T 0x0010U /* Special */
 #define_RUNETYPE_Q 0x0020U /* Phonogram */
-#define_RUNETYPE_SWM   0xc000U/* Mask to get screen width data */
+#define_RUNETYPE_SWM   0xe000U /* Mask to get screen width data */
 #define_RUNETYPE_SWS   30  /* Bits to shift to get width */
-#define_RUNETYPE_SW0   0xU /* 0 width character */
+#define_RUNETYPE_SW0   0x2000U /* 0 width character */
 #define_RUNETYPE_SW1   0x4000U /* 1 width character */
 #define_RUNETYPE_SW2   0x8000U /* 2 width character */
 #define_RUNETYPE_SW3   0xc000U /* 3 width character */