Daiki Ueno <[email protected]> writes: > * lib/uniwbrk/wbrktable.c (uniwbrk_table): Support WBP_RI. Remove > WBP_EXTEND and WBP_FORMAT, which are now computed without using > the table.
Oops, I forgot to include the corresponding change to lib/uniwbrk/u-wordbreaks.h. Please use the attached patch instead. The snapshot tarball is updated to -alpha2, including the uniname update: ftp://alpha.gnu.org/gnu/libunistring/libunistring-0.9.5-alpha2.tar.xz Regards, -- Daiki Ueno
From 8f91a5c03008d9f4f0ec0fadb3607da471ead445 Mon Sep 17 00:00:00 2001 From: Daiki Ueno <[email protected]> Date: Fri, 10 Oct 2014 15:19:03 +0900 Subject: [PATCH 6/8] Update to Unicode 6.2.0 * lib/unilbrk/lbrktables.h (LBP_RI): New enumeration value. * lib/uniwbrk.in.h (WBP_RI): New enumeration value. * lib/uniwbrk/u-wordbreaks.h (FUNC): Support rule WB13c. Normalize table index skipping ignored properties. * lib/uniwbrk/wbrktable.c (uniwbrk_table): Support WBP_RI. Remove WBP_EXTEND and WBP_FORMAT, which are now computed without using the table. * lib/uniwbrk/wbrktable.h: Adjust table size. * tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string): Support WBP_RI. * lib/unigbrk.in.h (GBP_RI): New enumeration value. * lib/unigbrk/uc-is-grapheme-break.c (UC_IS_GRAPHEME_BREAK): Support rule GB8a. (UC_GRAPHEME_BREAKS_FOR, gb_table): Support GBP_RI. * tests/unigbrk/test-uc-is-grapheme-break.c (graphemebreakproperty_to_string): Support GBP_RI. * lib/gen-uni-tables.c (LBP_RI): New enumeration value. (get_lbp, debug_output_lbp, fill_org_lbp, debug_output_org_lbp) (output_lbp): Support LBP_RI. (WBP_RI): New enumeration value. (debug_output_wbp, fill_org_wbp, debug_output_org_wbp) (output_wbp): Support WBP_RI. (GBP_RI): New enumeration value. (output_gbp_test, fill_org_gbp): Support GBP_RI. 
--- lib/gen-uni-tables.c | 31 +++++++++++++++++++++----- lib/unigbrk.in.h | 3 ++- lib/unigbrk/uc-is-grapheme-break.c | 9 ++++++-- lib/unilbrk/lbrktables.h | 1 + lib/uniwbrk.in.h | 3 ++- lib/uniwbrk/u-wordbreaks.h | 36 +++++++++++++++++++++---------- lib/uniwbrk/wbrktable.c | 24 ++++++++++----------- lib/uniwbrk/wbrktable.h | 2 +- tests/unigbrk/test-uc-gbrk-prop.c | 1 + tests/unigbrk/test-uc-is-grapheme-break.c | 1 + tests/uniwbrk/test-uc-wordbreaks.c | 1 + 11 files changed, 79 insertions(+), 33 deletions(-) diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c index 507c55e..1f86a0f 100644 --- a/lib/gen-uni-tables.c +++ b/lib/gen-uni-tables.c @@ -32,7 +32,7 @@ /usr/local/share/Unidata/CompositionExclusions.txt \ /usr/local/share/Unidata/SpecialCasing.txt \ /usr/local/share/Unidata/CaseFolding.txt \ - 6.1.0 + 6.2.0 */ #include <stdbool.h> @@ -6249,6 +6249,7 @@ enum LBP_JL = 22, /* Hangul L Jamo */ LBP_JV = 23, /* Hangul V Jamo */ LBP_JT = 24, /* Hangul T Jamo */ + LBP_RI = 34, /* regional indicator */ LBP_SA = 31, /* complex context (South East Asian) */ LBP_XX = 32 /* unknown */ }; @@ -6708,6 +6709,10 @@ get_lbp (unsigned int ch) if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB)) attr |= (int64_t) 1 << LBP_JT; + /* regional indicator */ + if (ch >= 0x1F1E6 && ch <= 0x1F1FF) + attr |= (int64_t) 1 << LBP_RI; + /* complex context (South East Asian) */ if (((unicode_attributes[ch].category[0] == 'C' && unicode_attributes[ch].category[1] == 'f') @@ -6860,7 +6865,7 @@ get_lbp (unsigned int ch) || ch == 0x2064 /* INVISIBLE PLUS */ /* Extra characters for compatibility with Unicode LineBreak.txt. 
*/ || ch == 0x110BD /* KAITHI NUMBER SIGN */) - if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))) + if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))) { /* ambiguous (alphabetic) ? 
*/ if ((unicode_width[ch] != NULL @@ -6985,6 +6990,7 @@ debug_output_lbp (FILE *stream) PRINT_BIT(attr,LBP_JL); PRINT_BIT(attr,LBP_JV); PRINT_BIT(attr,LBP_JT); + PRINT_BIT(attr,LBP_RI); PRINT_BIT(attr,LBP_SA); PRINT_BIT(attr,LBP_XX); #undef PRINT_BIT @@ -7100,6 +7106,7 @@ fill_org_lbp (const char *linebreak_filename) TRY(LBP_JL) TRY(LBP_JV) TRY(LBP_JT) + TRY(LBP_RI) TRY(LBP_SA) TRY(LBP_XX) #undef TRY @@ -7182,6 +7189,7 @@ debug_output_org_lbp (FILE *stream) PRINT_BIT(attr,LBP_JL); PRINT_BIT(attr,LBP_JV); PRINT_BIT(attr,LBP_JT); + PRINT_BIT(attr,LBP_RI); PRINT_BIT(attr,LBP_SA); PRINT_BIT(attr,LBP_XX); #undef PRINT_BIT @@ -7356,6 +7364,7 @@ output_lbp (FILE *stream1, FILE *stream2) CASE(LBP_JL); CASE(LBP_JV); CASE(LBP_JT); + CASE(LBP_RI); CASE(LBP_SA); CASE(LBP_XX); #undef CASE @@ -7455,7 +7464,8 @@ enum WBP_MIDLETTER = 4, WBP_MIDNUM = 5, WBP_NUMERIC = 6, - WBP_EXTENDNUMLET = 7 + WBP_EXTENDNUMLET = 7, + WBP_RI = 13 }; /* Returns the word breaking property for ch, as a bit mask. */ @@ -7523,6 +7533,9 @@ get_wbp (unsigned int ch) if (unicode_attributes[ch].category != NULL && strcmp (unicode_attributes[ch].category, "Pc") == 0) attr |= 1 << WBP_EXTENDNUMLET; + + if (((get_lbp (ch) >> LBP_RI) & 1) != 0) + attr |= 1 << WBP_RI; } if (attr == 0) @@ -7568,7 +7581,9 @@ debug_output_wbp (FILE *stream) fprintf (stream, " Numeric"); if (attr & (1 << WBP_EXTENDNUMLET)) fprintf (stream, " ExtendNumLet"); - fprintf (stream, "\n"); + if (attr & (1 << WBP_RI)) + fprintf (stream, " Regional_Indicator"); + fprintf (stream, "\n"); } } } @@ -7653,6 +7668,7 @@ fill_org_wbp (const char *wordbreakproperty_filename) PROP ("MidNum", WBP_MIDNUM) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) + PROP ("Regional_Indicator", WBP_RI) #undef PROP { fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, @@ -7699,6 +7715,7 @@ debug_output_org_wbp (FILE *stream) PROP ("MidNum", WBP_MIDNUM) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) + PROP 
("Regional_Indicator", WBP_RI) #undef PROP fprintf (stream, " ??"); fprintf (stream, "\n"); @@ -7851,6 +7868,7 @@ output_wbp (FILE *stream) CASE(WBP_MIDNUM); CASE(WBP_NUMERIC); CASE(WBP_EXTENDNUMLET); + CASE(WBP_RI); #undef CASE default: abort (); @@ -7931,7 +7949,8 @@ enum GBP_V = 8, GBP_T = 9, GBP_LV = 10, - GBP_LVT = 11 + GBP_LVT = 11, + GBP_RI = 12 }; /* Construction of sparse 3-level tables. */ @@ -8002,6 +8021,7 @@ output_gbp_test (const char *filename) CASE (GBP_T) CASE (GBP_LV) CASE (GBP_LVT) + CASE (GBP_RI) #undef CASE default: abort (); @@ -8199,6 +8219,7 @@ fill_org_gbp (const char *graphemebreakproperty_filename) PROP ("T", GBP_T) PROP ("LV", GBP_LV) PROP ("LVT", GBP_LVT) + PROP ("Regional_Indicator", GBP_RI) #undef PROP { fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname, diff --git a/lib/unigbrk.in.h b/lib/unigbrk.in.h index 8335e5a..a708a8c 100644 --- a/lib/unigbrk.in.h +++ b/lib/unigbrk.in.h @@ -51,7 +51,8 @@ enum GBP_V = 8, GBP_T = 9, GBP_LV = 10, - GBP_LVT = 11 + GBP_LVT = 11, + GBP_RI = 12 }; /* Return the Grapheme_Cluster_Break property of a Unicode character. */ diff --git a/lib/unigbrk/uc-is-grapheme-break.c b/lib/unigbrk/uc-is-grapheme-break.c index 0e61e79..7d1759c 100644 --- a/lib/unigbrk/uc-is-grapheme-break.c +++ b/lib/unigbrk/uc-is-grapheme-break.c @@ -47,6 +47,9 @@ /* GB8 */ \ ((A) == GBP_LVT || (A) == GBP_T) && (B) == GBP_T ? false : \ \ + /* GB8a */ \ + (A) == GBP_RI && (B) == GBP_RI ? false : \ + \ /* GB9 */ \ (B) == GBP_EXTEND ? 
false : \ \ @@ -71,9 +74,10 @@ | (UC_IS_GRAPHEME_BREAK(A, GBP_V) << GBP_V) \ | (UC_IS_GRAPHEME_BREAK(A, GBP_T) << GBP_T) \ | (UC_IS_GRAPHEME_BREAK(A, GBP_LV) << GBP_LV) \ - | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT)) + | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT) \ + | (UC_IS_GRAPHEME_BREAK(A, GBP_RI) << GBP_RI)) -static const unsigned short int gb_table[12] = +static const unsigned short int gb_table[13] = { UC_GRAPHEME_BREAKS_FOR(0), /* GBP_OTHER */ UC_GRAPHEME_BREAKS_FOR(1), /* GBP_CR */ @@ -87,6 +91,7 @@ static const unsigned short int gb_table[12] = UC_GRAPHEME_BREAKS_FOR(9), /* GBP_T */ UC_GRAPHEME_BREAKS_FOR(10), /* GBP_LV */ UC_GRAPHEME_BREAKS_FOR(11), /* GBP_LVT */ + UC_GRAPHEME_BREAKS_FOR(12), /* GBP_RI */ }; bool diff --git a/lib/unilbrk/lbrktables.h b/lib/unilbrk/lbrktables.h index 9014573..1467926 100644 --- a/lib/unilbrk/lbrktables.h +++ b/lib/unilbrk/lbrktables.h @@ -59,6 +59,7 @@ enum LBP_JL = 22, /* Hangul L Jamo */ LBP_JV = 23, /* Hangul V Jamo */ LBP_JT = 24, /* Hangul T Jamo */ + LBP_RI = 34, /* regional indicator */ LBP_SA = 31, /* complex context (South East Asian) */ LBP_XX = 32 /* unknown */ }; diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h index ab4b532..c272d48 100644 --- a/lib/uniwbrk.in.h +++ b/lib/uniwbrk.in.h @@ -49,7 +49,8 @@ enum WBP_MIDLETTER = 4, WBP_MIDNUM = 5, WBP_NUMERIC = 6, - WBP_EXTENDNUMLET = 7 + WBP_EXTENDNUMLET = 7, + WBP_RI = 13 }; /* Return the Word_Break property of a Unicode character. */ diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h index 33ca7eb..04d2738 100644 --- a/lib/uniwbrk/u-wordbreaks.h +++ b/lib/uniwbrk/u-wordbreaks.h @@ -55,16 +55,12 @@ FUNC (const UNIT *s, size_t n, char *p) if (last_char_prop == WBP_CR && prop == WBP_LF) /* *p = 0 */; /* Break before and after newlines. 
*/ - else if (last_char_prop >= WBP_NEWLINE - /* same as: - last_char_prop == WBP_CR - || last_char_prop == WBP_LF - || last_char_prop == WBP_NEWLINE */ - || prop >= WBP_NEWLINE - /* same as: - prop == WBP_CR - || prop == WBP_LF - || prop == WBP_NEWLINE */) + else if ((last_char_prop == WBP_CR + || last_char_prop == WBP_LF + || last_char_prop == WBP_NEWLINE) + || (prop == WBP_CR + || prop == WBP_LF + || prop == WBP_NEWLINE)) *p = 1; /* Ignore Format and Extend characters. */ else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) @@ -85,6 +81,7 @@ FUNC (const UNIT *s, size_t n, char *p) (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) ExtendNumLet × ExtendNumLet (WB13a) ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) + Regional_Indicator × Regional_Indicator (WB13c) */ /* No break across certain punctuation. Also, disable word breaks that were recognized earlier (due to lookahead of @@ -101,10 +98,27 @@ FUNC (const UNIT *s, size_t n, char *p) *last_compchar_ptr = 0; /* *p = 0; */ } + /* Break after Format and Extend characters. */ + else if (last_compchar_prop == WBP_EXTEND + || last_compchar_prop == WBP_FORMAT) + *p = 1; else { + /* Normalize property value to table index, + skipping 5 properties: WBP_EXTEND, + WBP_FORMAT, WBP_NEWLINE, WBP_CR, and + WBP_LF. */ + int last_compchar_prop_index = last_compchar_prop; + int prop_index = prop; + + if (last_compchar_prop_index >= WBP_EXTEND) + last_compchar_prop_index -= 5; + + if (prop_index >= WBP_EXTEND) + prop_index -= 5; + /* Perform a single table lookup. 
*/ - if (uniwbrk_table[last_compchar_prop][prop]) + if (uniwbrk_table[last_compchar_prop_index][prop_index]) *p = 1; /* else *p = 0; */ } diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c index 7cbe4d6..04bd0e5 100644 --- a/lib/uniwbrk/wbrktable.c +++ b/lib/uniwbrk/wbrktable.c @@ -32,21 +32,21 @@ (ALetter | Numeric | Katakana) × ExtendNumLet (WB13a) ExtendNumLet × ExtendNumLet (WB13a) ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b) + Regional_Indicator × Regional_Indicator (WB13c) */ -const unsigned char uniwbrk_table[10][8] = +const unsigned char uniwbrk_table[9][9] = { /* current: OTHER MIDNUMLET NUMERIC */ /* KATAKANA MIDLETTER EXTENDNUMLET */ - /* ALETTER MIDNUM */ + /* ALETTER MIDNUM RI */ /* last */ - /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0 }, - /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0 }, - /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0 }, - /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0 }, - /* WBP_EXTEND */ { 1, 1, 1, 1, 1, 1, 1, 1 }, - /* WBP_FORMAT */ { 1, 1, 1, 1, 1, 1, 1, 1 } + /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1 }, + /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 }, + /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 }, + /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1 }, + /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0 } }; diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h index 1b48adf..50b7823 100644 --- a/lib/uniwbrk/wbrktable.h +++ b/lib/uniwbrk/wbrktable.h @@ -15,4 +15,4 @@ You should have received a copy of the GNU Lesser General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ -extern const unsigned char uniwbrk_table[10][8]; +extern const unsigned char uniwbrk_table[9][9]; diff --git a/tests/unigbrk/test-uc-gbrk-prop.c b/tests/unigbrk/test-uc-gbrk-prop.c index 1c71280..4bfbdba 100644 --- a/tests/unigbrk/test-uc-gbrk-prop.c +++ b/tests/unigbrk/test-uc-gbrk-prop.c @@ -50,6 +50,7 @@ graphemebreakproperty_to_string (int gbp) CASE(T) CASE(LV) CASE(LVT) + CASE(RI) } abort (); } diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c b/tests/unigbrk/test-uc-is-grapheme-break.c index a93f6f2..dbaf3dc 100644 --- a/tests/unigbrk/test-uc-is-grapheme-break.c +++ b/tests/unigbrk/test-uc-is-grapheme-break.c @@ -44,6 +44,7 @@ graphemebreakproperty_to_string (int gbp) CASE(T) CASE(LV) CASE(LVT) + CASE(RI) } abort (); } diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c index 87e0e05..710f583 100644 --- a/tests/uniwbrk/test-uc-wordbreaks.c +++ b/tests/uniwbrk/test-uc-wordbreaks.c @@ -44,6 +44,7 @@ wordbreakproperty_to_string (int wbp) CASE(MIDNUM) CASE(NUMERIC) CASE(EXTENDNUMLET) + CASE(RI) } abort (); } -- 2.1.1
