Re: [bug-libunistring] [PATCH 6/8] Update to Unicode 6.2.0

Daiki Ueno Fri, 10 Oct 2014 21:30:01 -0700

Daiki Ueno <[email protected]> writes:

> * lib/uniwbrk/wbrktable.c (uniwbrk_table): Support WBP_RI.  Remove
> WBP_EXTEND and WBP_FORMAT, which are now computed without using
> the table.


Oops, I forgot to include the corresponding change to
lib/uniwbrk/u-wordbreaks.h.  Please use the attached patch instead.  The
snapshot tarball is updated to -alpha2, including the uniname update:

ftp://alpha.gnu.org/gnu/libunistring/libunistring-0.9.5-alpha2.tar.xz

Regards,
-- 
Daiki Ueno

>From 8f91a5c03008d9f4f0ec0fadb3607da471ead445 Mon Sep 17 00:00:00 2001
From: Daiki Ueno <[email protected]>
Date: Fri, 10 Oct 2014 15:19:03 +0900
Subject: [PATCH 6/8] Update to Unicode 6.2.0

* lib/unilbrk/lbrktables.h (LBP_RI): New enumeration value.

* lib/uniwbrk.in.h (WBP_RI): New enumeration value.
* lib/uniwbrk/u-wordbreaks.h (FUNC): Support rule WB13c.
Normalize table index skipping ignored properties.
* lib/uniwbrk/wbrktable.c (uniwbrk_table): Support WBP_RI.  Remove
WBP_EXTEND and WBP_FORMAT, which are now computed without using
the table.
* lib/uniwbrk/wbrktable.h: Adjust table size.
* tests/uniwbrk/test-uc-wordbreaks.c
(wordbreakproperty_to_string): Support WBP_RI.

* lib/unigbrk.in.h (GBP_RI): New enumeration value.
* lib/unigbrk/uc-is-grapheme-break.c (UC_IS_GRAPHEME_BREAK):
Support rule GB8a.
(UC_GRAPHEME_BREAKS_FOR, gb_table): Support GBP_RI.
* tests/unigbrk/test-uc-gbrk-prop.c
(graphemebreakproperty_to_string): Support GBP_RI.
* tests/unigbrk/test-uc-is-grapheme-break.c
(graphemebreakproperty_to_string): Support GBP_RI.

* lib/gen-uni-tables.c (LBP_RI): New enumeration value.
(get_lbp, debug_output_lbp, fill_org_lbp, debug_output_org_lbp)
(output_lbp): Support LBP_RI.
(WBP_RI): New enumeration value.
(get_wbp, debug_output_wbp, fill_org_wbp, debug_output_org_wbp)
(output_wbp): Support WBP_RI.
(GBP_RI): New enumeration value.
(output_gbp_test, fill_org_gbp): Support GBP_RI.
---
 lib/gen-uni-tables.c                      | 31 +++++++++++++++++++++-----
 lib/unigbrk.in.h                          |  3 ++-
 lib/unigbrk/uc-is-grapheme-break.c        |  9 ++++++--
 lib/unilbrk/lbrktables.h                  |  1 +
 lib/uniwbrk.in.h                          |  3 ++-
 lib/uniwbrk/u-wordbreaks.h                | 36 +++++++++++++++++++++----------
 lib/uniwbrk/wbrktable.c                   | 24 ++++++++++-----------
 lib/uniwbrk/wbrktable.h                   |  2 +-
 tests/unigbrk/test-uc-gbrk-prop.c         |  1 +
 tests/unigbrk/test-uc-is-grapheme-break.c |  1 +
 tests/uniwbrk/test-uc-wordbreaks.c        |  1 +
 11 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index 507c55e..1f86a0f 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
                       /usr/local/share/Unidata/CompositionExclusions.txt \
                       /usr/local/share/Unidata/SpecialCasing.txt \
                       /usr/local/share/Unidata/CaseFolding.txt \
-                      6.1.0
+                      6.2.0
  */
 
 #include <stdbool.h>
@@ -6249,6 +6249,7 @@ enum
   LBP_JL = 22, /* Hangul L Jamo */
   LBP_JV = 23, /* Hangul V Jamo */
   LBP_JT = 24, /* Hangul T Jamo */
+  LBP_RI = 34, /* regional indicator */
   LBP_SA = 31, /* complex context (South East Asian) */
   LBP_XX = 32  /* unknown */
 };
@@ -6708,6 +6709,10 @@ get_lbp (unsigned int ch)
       if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
         attr |= (int64_t) 1 << LBP_JT;
 
+      /* regional indicator */
+      if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
+        attr |= (int64_t) 1 << LBP_RI;
+
       /* complex context (South East Asian) */
       if (((unicode_attributes[ch].category[0] == 'C'
             && unicode_attributes[ch].category[1] == 'f')
@@ -6860,7 +6865,7 @@ get_lbp (unsigned int ch)
           || ch == 0x2064 /* INVISIBLE PLUS */
           /* Extra characters for compatibility with Unicode LineBreak.txt.  */
           || ch == 0x110BD /* KAITHI NUMBER SIGN */)
-        if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
+        if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
           {
             /* ambiguous (alphabetic) ? */
             if ((unicode_width[ch] != NULL
@@ -6985,6 +6990,7 @@ debug_output_lbp (FILE *stream)
           PRINT_BIT(attr,LBP_JL);
           PRINT_BIT(attr,LBP_JV);
           PRINT_BIT(attr,LBP_JT);
+          PRINT_BIT(attr,LBP_RI);
           PRINT_BIT(attr,LBP_SA);
           PRINT_BIT(attr,LBP_XX);
 #undef PRINT_BIT
@@ -7100,6 +7106,7 @@ fill_org_lbp (const char *linebreak_filename)
       TRY(LBP_JL)
       TRY(LBP_JV)
       TRY(LBP_JT)
+      TRY(LBP_RI)
       TRY(LBP_SA)
       TRY(LBP_XX)
 #undef TRY
@@ -7182,6 +7189,7 @@ debug_output_org_lbp (FILE *stream)
           PRINT_BIT(attr,LBP_JL);
           PRINT_BIT(attr,LBP_JV);
           PRINT_BIT(attr,LBP_JT);
+          PRINT_BIT(attr,LBP_RI);
           PRINT_BIT(attr,LBP_SA);
           PRINT_BIT(attr,LBP_XX);
 #undef PRINT_BIT
@@ -7356,6 +7364,7 @@ output_lbp (FILE *stream1, FILE *stream2)
           CASE(LBP_JL);
           CASE(LBP_JV);
           CASE(LBP_JT);
+          CASE(LBP_RI);
           CASE(LBP_SA);
           CASE(LBP_XX);
 #undef CASE
@@ -7455,7 +7464,8 @@ enum
   WBP_MIDLETTER    = 4,
   WBP_MIDNUM       = 5,
   WBP_NUMERIC      = 6,
-  WBP_EXTENDNUMLET = 7
+  WBP_EXTENDNUMLET = 7,
+  WBP_RI           = 13
 };
 
 /* Returns the word breaking property for ch, as a bit mask.  */
@@ -7523,6 +7533,9 @@ get_wbp (unsigned int ch)
       if (unicode_attributes[ch].category != NULL
           && strcmp (unicode_attributes[ch].category, "Pc") == 0)
         attr |= 1 << WBP_EXTENDNUMLET;
+
+      if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
+        attr |= 1 << WBP_RI;
     }
 
   if (attr == 0)
@@ -7568,7 +7581,9 @@ debug_output_wbp (FILE *stream)
             fprintf (stream, " Numeric");
           if (attr & (1 << WBP_EXTENDNUMLET))
             fprintf (stream, " ExtendNumLet");
-          fprintf (stream, "\n");
+          if (attr & (1 << WBP_RI))
+            fprintf (stream, " Regional_Indicator");
+         fprintf (stream, "\n");
         }
     }
 }
@@ -7653,6 +7668,7 @@ fill_org_wbp (const char *wordbreakproperty_filename)
       PROP ("MidNum", WBP_MIDNUM)
       PROP ("Numeric", WBP_NUMERIC)
       PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+      PROP ("Regional_Indicator", WBP_RI)
 #undef PROP
         {
           fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
@@ -7699,6 +7715,7 @@ debug_output_org_wbp (FILE *stream)
           PROP ("MidNum", WBP_MIDNUM)
           PROP ("Numeric", WBP_NUMERIC)
           PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+          PROP ("Regional_Indicator", WBP_RI)
 #undef PROP
           fprintf (stream, " ??");
           fprintf (stream, "\n");
@@ -7851,6 +7868,7 @@ output_wbp (FILE *stream)
           CASE(WBP_MIDNUM);
           CASE(WBP_NUMERIC);
           CASE(WBP_EXTENDNUMLET);
+          CASE(WBP_RI);
 #undef CASE
           default:
             abort ();
@@ -7931,7 +7949,8 @@ enum
   GBP_V            = 8,
   GBP_T            = 9,
   GBP_LV           = 10,
-  GBP_LVT          = 11
+  GBP_LVT          = 11,
+  GBP_RI           = 12
 };
 
 /* Construction of sparse 3-level tables.  */
@@ -8002,6 +8021,7 @@ output_gbp_test (const char *filename)
       CASE (GBP_T)
       CASE (GBP_LV)
       CASE (GBP_LVT)
+      CASE (GBP_RI)
 #undef CASE
         default:
           abort ();
@@ -8199,6 +8219,7 @@ fill_org_gbp (const char *graphemebreakproperty_filename)
       PROP ("T", GBP_T)
       PROP ("LV", GBP_LV)
       PROP ("LVT", GBP_LVT)
+      PROP ("Regional_Indicator", GBP_RI)
 #undef PROP
         {
           fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
diff --git a/lib/unigbrk.in.h b/lib/unigbrk.in.h
index 8335e5a..a708a8c 100644
--- a/lib/unigbrk.in.h
+++ b/lib/unigbrk.in.h
@@ -51,7 +51,8 @@ enum
   GBP_V            = 8,
   GBP_T            = 9,
   GBP_LV           = 10,
-  GBP_LVT          = 11
+  GBP_LVT          = 11,
+  GBP_RI           = 12
 };
 
 /* Return the Grapheme_Cluster_Break property of a Unicode character. */
diff --git a/lib/unigbrk/uc-is-grapheme-break.c b/lib/unigbrk/uc-is-grapheme-break.c
index 0e61e79..7d1759c 100644
--- a/lib/unigbrk/uc-is-grapheme-break.c
+++ b/lib/unigbrk/uc-is-grapheme-break.c
@@ -47,6 +47,9 @@
    /* GB8 */                                                            \
    ((A) == GBP_LVT || (A) == GBP_T) && (B) == GBP_T ? false :           \
                                                                         \
+   /* GB8a */								\
+   (A) == GBP_RI && (B) == GBP_RI ? false :				\
+									\
    /* GB9 */                                                            \
    (B) == GBP_EXTEND ? false :                                          \
                                                                         \
@@ -71,9 +74,10 @@
    | (UC_IS_GRAPHEME_BREAK(A, GBP_V)           << GBP_V)                \
    | (UC_IS_GRAPHEME_BREAK(A, GBP_T)           << GBP_T)                \
    | (UC_IS_GRAPHEME_BREAK(A, GBP_LV)          << GBP_LV)               \
-   | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT)         << GBP_LVT))
+   | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT)         << GBP_LVT)              \
+   | (UC_IS_GRAPHEME_BREAK(A, GBP_RI)          << GBP_RI))
 
-static const unsigned short int gb_table[12] =
+static const unsigned short int gb_table[13] =
   {
     UC_GRAPHEME_BREAKS_FOR(0),  /* GBP_OTHER */
     UC_GRAPHEME_BREAKS_FOR(1),  /* GBP_CR */
@@ -87,6 +91,7 @@ static const unsigned short int gb_table[12] =
     UC_GRAPHEME_BREAKS_FOR(9),  /* GBP_T */
     UC_GRAPHEME_BREAKS_FOR(10), /* GBP_LV */
     UC_GRAPHEME_BREAKS_FOR(11), /* GBP_LVT */
+    UC_GRAPHEME_BREAKS_FOR(12), /* GBP_RI */
   };
 
 bool
diff --git a/lib/unilbrk/lbrktables.h b/lib/unilbrk/lbrktables.h
index 9014573..1467926 100644
--- a/lib/unilbrk/lbrktables.h
+++ b/lib/unilbrk/lbrktables.h
@@ -59,6 +59,7 @@ enum
   LBP_JL = 22, /* Hangul L Jamo */
   LBP_JV = 23, /* Hangul V Jamo */
   LBP_JT = 24, /* Hangul T Jamo */
+  LBP_RI = 34, /* regional indicator */
   LBP_SA = 31, /* complex context (South East Asian) */
   LBP_XX = 32  /* unknown */
 };
diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h
index ab4b532..c272d48 100644
--- a/lib/uniwbrk.in.h
+++ b/lib/uniwbrk.in.h
@@ -49,7 +49,8 @@ enum
   WBP_MIDLETTER    = 4,
   WBP_MIDNUM       = 5,
   WBP_NUMERIC      = 6,
-  WBP_EXTENDNUMLET = 7
+  WBP_EXTENDNUMLET = 7,
+  WBP_RI           = 13
 };
 
 /* Return the Word_Break property of a Unicode character.  */
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 33ca7eb..04d2738 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -55,16 +55,12 @@ FUNC (const UNIT *s, size_t n, char *p)
               if (last_char_prop == WBP_CR && prop == WBP_LF)
                 /* *p = 0 */;
               /* Break before and after newlines.  */
-              else if (last_char_prop >= WBP_NEWLINE
-                       /* same as:
-                          last_char_prop == WBP_CR
-                          || last_char_prop == WBP_LF
-                          || last_char_prop == WBP_NEWLINE */
-                       || prop >= WBP_NEWLINE
-                          /* same as:
-                             prop == WBP_CR
-                             || prop == WBP_LF
-                             || prop == WBP_NEWLINE */)
+              else if ((last_char_prop == WBP_CR
+                        || last_char_prop == WBP_LF
+                        || last_char_prop == WBP_NEWLINE)
+                       || (prop == WBP_CR
+                           || prop == WBP_LF
+                           || prop == WBP_NEWLINE))
                 *p = 1;
               /* Ignore Format and Extend characters.  */
               else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
@@ -85,6 +81,7 @@ FUNC (const UNIT *s, size_t n, char *p)
                           (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
                                             ExtendNumLet × ExtendNumLet (WB13a)
                          ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b)
+                               Regional_Indicator × Regional_Indicator  (WB13c)
                    */
                   /* No break across certain punctuation.  Also, disable word
                      breaks that were recognized earlier (due to lookahead of
@@ -101,10 +98,27 @@ FUNC (const UNIT *s, size_t n, char *p)
                       *last_compchar_ptr = 0;
                       /* *p = 0; */
                     }
+                  /* Break after Format and Extend characters.  */
+                  else if (last_compchar_prop == WBP_EXTEND
+                           || last_compchar_prop == WBP_FORMAT)
+                    *p = 1;
                   else
                     {
+                      /* Normalize property value to table index,
+                         skipping 5 properties: WBP_EXTEND,
+                         WBP_FORMAT, WBP_NEWLINE, WBP_CR, and
+                         WBP_LF.  */
+                      int last_compchar_prop_index = last_compchar_prop;
+                      int prop_index = prop;
+
+                      if (last_compchar_prop_index >= WBP_EXTEND)
+                        last_compchar_prop_index -= 5;
+
+                      if (prop_index >= WBP_EXTEND)
+                        prop_index -= 5;
+
                       /* Perform a single table lookup.  */
-                      if (uniwbrk_table[last_compchar_prop][prop])
+                      if (uniwbrk_table[last_compchar_prop_index][prop_index])
                         *p = 1;
                       /* else *p = 0; */
                     }
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 7cbe4d6..04bd0e5 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -32,21 +32,21 @@
   (ALetter | Numeric | Katakana) × ExtendNumLet                    (WB13a)
                     ExtendNumLet × ExtendNumLet                    (WB13a)
                     ExtendNumLet × (ALetter | Numeric | Katakana)  (WB13b)
+              Regional_Indicator × Regional_Indicator              (WB13c)
  */
 
-const unsigned char uniwbrk_table[10][8] =
+const unsigned char uniwbrk_table[9][9] =
 {        /* current:      OTHER            MIDNUMLET         NUMERIC         */
          /*                   KATAKANA           MIDLETTER      EXTENDNUMLET */
-         /*                          ALETTER            MIDNUM               */
+         /*                          ALETTER            MIDNUM           RI  */
   /* last */
-  /* WBP_OTHER */        {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_KATAKANA */     {  1,    0,    1,    1,    1,    1,    1,    0 },
-  /* WBP_ALETTER */      {  1,    1,    0,    1,    1,    1,    0,    0 },
-  /* WBP_MIDNUMLET */    {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_MIDLETTER */    {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_MIDNUM */       {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_NUMERIC */      {  1,    1,    0,    1,    1,    1,    0,    0 },
-  /* WBP_EXTENDNUMLET */ {  1,    0,    0,    1,    1,    1,    0,    0 },
-  /* WBP_EXTEND */       {  1,    1,    1,    1,    1,    1,    1,    1 },
-  /* WBP_FORMAT */       {  1,    1,    1,    1,    1,    1,    1,    1 }
+  /* WBP_OTHER */        {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
+  /* WBP_KATAKANA */     {  1,    0,    1,    1,    1,    1,    1,    0,    1 },
+  /* WBP_ALETTER */      {  1,    1,    0,    1,    1,    1,    0,    0,    1 },
+  /* WBP_MIDNUMLET */    {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
+  /* WBP_MIDLETTER */    {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
+  /* WBP_MIDNUM */       {  1,    1,    1,    1,    1,    1,    1,    1,    1 },
+  /* WBP_NUMERIC */      {  1,    1,    0,    1,    1,    1,    0,    0,    1 },
+  /* WBP_EXTENDNUMLET */ {  1,    0,    0,    1,    1,    1,    0,    0,    1 },
+  /* WBP_RI */           {  1,    1,    1,    1,    1,    1,    1,    1,    0 }
 };
diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h
index 1b48adf..50b7823 100644
--- a/lib/uniwbrk/wbrktable.h
+++ b/lib/uniwbrk/wbrktable.h
@@ -15,4 +15,4 @@
    You should have received a copy of the GNU Lesser General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
-extern const unsigned char uniwbrk_table[10][8];
+extern const unsigned char uniwbrk_table[9][9];
diff --git a/tests/unigbrk/test-uc-gbrk-prop.c b/tests/unigbrk/test-uc-gbrk-prop.c
index 1c71280..4bfbdba 100644
--- a/tests/unigbrk/test-uc-gbrk-prop.c
+++ b/tests/unigbrk/test-uc-gbrk-prop.c
@@ -50,6 +50,7 @@ graphemebreakproperty_to_string (int gbp)
       CASE(T)
       CASE(LV)
       CASE(LVT)
+      CASE(RI)
     }
   abort ();
 }
diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c b/tests/unigbrk/test-uc-is-grapheme-break.c
index a93f6f2..dbaf3dc 100644
--- a/tests/unigbrk/test-uc-is-grapheme-break.c
+++ b/tests/unigbrk/test-uc-is-grapheme-break.c
@@ -44,6 +44,7 @@ graphemebreakproperty_to_string (int gbp)
       CASE(T)
       CASE(LV)
       CASE(LVT)
+      CASE(RI)
     }
   abort ();
 }
diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c
index 87e0e05..710f583 100644
--- a/tests/uniwbrk/test-uc-wordbreaks.c
+++ b/tests/uniwbrk/test-uc-wordbreaks.c
@@ -44,6 +44,7 @@ wordbreakproperty_to_string (int wbp)
       CASE(MIDNUM)
       CASE(NUMERIC)
       CASE(EXTENDNUMLET)
+      CASE(RI)
     }
   abort ();
 }
-- 
2.1.1

Re: [bug-libunistring] [PATCH 6/8] Update to Unicode 6.2.0

Reply via email to