src/hb-glib.cc | 10 ++-- src/hb-icu.cc | 20 +++++--- src/hb-ot-shape-complex-arabic.cc | 6 ++ src/hb-ot-shape-complex-indic.cc | 7 ++ src/hb-ot-shape-complex-misc.cc | 6 ++ src/hb-ot-shape-complex-private.hh | 31 +++++++++++- src/hb-ot-shape-normalize.cc | 92 ++++++++++++++++++++++++++++++------- src/hb-ot-shape-private.hh | 4 - src/hb-ot-shape.cc | 6 +- test/test-unicode.c | 11 +++- 10 files changed, 154 insertions(+), 39 deletions(-)
New commits: commit 63c0ef4a0763e579c9c80887bbfbd2651de05067 Author: Behdad Esfahbod <[email protected]> Date: Thu Jul 21 20:58:42 2011 -0400 Fix decompose() implementations to work with non-starter non-composables Add tests. diff --git a/src/hb-glib.cc b/src/hb-glib.cc index fbf8cf5..76e1dfd 100644 --- a/src/hb-glib.cc +++ b/src/hb-glib.cc @@ -296,16 +296,16 @@ hb_glib_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, *b = 0; ret = *a != ab; } else if (len == 2) { + *a = g_utf8_get_char (normalized); + *b = g_utf8_get_char (g_utf8_next_char (normalized)); /* Here's the ugly part: if ab decomposes to a single character and * that character decomposes again, we have to detect that and undo * the second part :-(. */ gchar *recomposed = g_utf8_normalize (normalized, -1, G_NORMALIZE_NFC); - if (g_utf8_get_char (recomposed) != ab) { - *a = g_utf8_get_char (recomposed); + hb_codepoint_t c = g_utf8_get_char (recomposed); + if (c != ab && c != *a) { + *a = c; *b = 0; - } else { - *a = g_utf8_get_char (normalized); - *b = g_utf8_get_char (g_utf8_next_char (normalized)); } g_free (recomposed); ret = TRUE; diff --git a/src/hb-icu.cc b/src/hb-icu.cc index 7b85cd5..7fe78d2 100644 --- a/src/hb-icu.cc +++ b/src/hb-icu.cc @@ -214,6 +214,10 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, hb_bool_t ret, err; UErrorCode icu_err; + /* This function is a monster! Maybe it wasn't a good idea adding a + * pairwise decompose API... */ + /* Watchout for the dragons. Err, watchout for macros changing len. */ + len = 0; err = FALSE; U16_APPEND (utf16, len, ARRAY_LENGTH (utf16), ab, err); @@ -232,21 +236,23 @@ hb_icu_unicode_decompose (hb_unicode_funcs_t *ufuncs HB_UNUSED, *b = 0; ret = *a != ab; } else if (len == 2) { + len =0; + U16_NEXT_UNSAFE (normalized, len, *a); + U16_NEXT_UNSAFE (normalized, len, *b); + /* Here's the ugly part: if ab decomposes to a single character and * that character decomposes again, we have to detect that and undo * the second part :-(. 
*/ UChar recomposed[20]; icu_err = U_ZERO_ERROR; - len = unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); + unorm_normalize (normalized, len, UNORM_NFC, 0, recomposed, ARRAY_LENGTH (recomposed), &icu_err); if (icu_err) return FALSE; - U16_GET_UNSAFE (recomposed, 0, *a); - if (*a != ab) { + hb_codepoint_t c; + U16_GET_UNSAFE (recomposed, 0, c); + if (c != *a && c != ab) { + *a = c; *b = 0; - } else { - len =0; - U16_NEXT_UNSAFE (normalized, len, *a); - U16_GET_UNSAFE (normalized, len, *b); } ret = TRUE; } else { diff --git a/test/test-unicode.c b/test/test-unicode.c index c614c7d..9f526d7 100644 --- a/test/test-unicode.c +++ b/test/test-unicode.c @@ -800,6 +800,10 @@ test_unicode_normalization (gconstpointer user_data) g_assert (!hb_unicode_compose (uf, 0x2126, 0, &ab) && ab == 0); g_assert (!hb_unicode_compose (uf, 0x03A9, 0, &ab) && ab == 0); + /* Non-starter pairs should not compose */ + g_assert (!hb_unicode_compose (uf, 0x0308, 0x0301, &ab) && ab == 0); /* !0x0344 */ + g_assert (!hb_unicode_compose (uf, 0x0F71, 0x0F72, &ab) && ab == 0); /* !0x0F73 */ + /* Pairs */ g_assert (hb_unicode_compose (uf, 0x0041, 0x030A, &ab) && ab == 0x00C5); g_assert (hb_unicode_compose (uf, 0x006F, 0x0302, &ab) && ab == 0x00F4); @@ -822,12 +826,13 @@ test_unicode_normalization (gconstpointer user_data) g_assert (!hb_unicode_decompose (uf, 0xFB01, &a, &b) && a == 0xFB01 && b == 0); /* Singletons */ - g_assert (hb_unicode_decompose (uf, 0x212B, &a, &b)); - g_assert_cmphex (a, ==, 0x00C5); - g_assert_cmphex (b, ==, 0); g_assert (hb_unicode_decompose (uf, 0x212B, &a, &b) && a == 0x00C5 && b == 0); g_assert (hb_unicode_decompose (uf, 0x2126, &a, &b) && a == 0x03A9 && b == 0); + /* Non-starter pairs decompose, but not compose */ + g_assert (hb_unicode_decompose (uf, 0x0344, &a, &b) && a == 0x0308 && b == 0x0301); + g_assert (hb_unicode_decompose (uf, 0x0F73, &a, &b) && a == 0x0F71 && b == 0x0F72); + /* Pairs */ g_assert 
(hb_unicode_decompose (uf, 0x00C5, &a, &b) && a == 0x0041 && b == 0x030A); g_assert (hb_unicode_decompose (uf, 0x00F4, &a, &b) && a == 0x006F && b == 0x0302); commit 5d90a342e319068716429bf7af76c3896b61a0e5 Author: Behdad Esfahbod <[email protected]> Date: Thu Jul 21 15:25:01 2011 -0400 Document normalization design diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc index a791e7c..6832779 100644 --- a/src/hb-ot-shape-normalize.cc +++ b/src/hb-ot-shape-normalize.cc @@ -25,20 +25,55 @@ */ #include "hb-ot-shape-private.hh" +#include "hb-ot-shape-complex-private.hh" HB_BEGIN_DECLS +/* + * HIGHLEVEL DESIGN: + * + * This file exports one main function: _hb_ot_shape_normalize(). + * + * This function closely reflects the Unicode Normalization Algorithm, + * yet it's different. The shaper can either prefer decomposed (NFD) or + * composed (NFC). + * + * In general what happens is that: each grapheme is decomposed in a chain + * of 1:2 decompositions, marks reordered, and then recomposed if desired, + * so far it's like Unicode Normalization. However, the decomposition and + * recomposition only happens if the font supports the resulting characters. + * + * The goals are: + * + * - Try to render all canonically equivalent strings similarly. To really + * achieve this we have to always do the full decomposition and then + * selectively recompose from there. It's kinda too expensive though, so + * we skip some cases. For example, if composed is desired, we simply + * don't touch 1-character clusters that are supported by the font, even + * though their NFC may be different. + * + * - When a font has a precomposed character for a sequence but the 'ccmp' + * feature in the font is not adequate, use the precomposed character + * which typically has better mark positioning. + * + * - When a font does not support a character but supports its decomposition, + * well, use the decomposition. + * + * - The Indic shaper requests decomposed output.
This will handle splitting + * matra for the Indic shaper. + */ + static bool get_glyph (hb_ot_shape_context_t *c, unsigned int i) { - hb_buffer_t *b = c->buffer; hb_codepoint_t glyph; - return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &glyph); + return hb_font_get_glyph (c->font, c->buffer->info[i].codepoint, 0, &glyph); } static bool decompose_single_char_cluster (hb_ot_shape_context_t *c, + bool recompose, unsigned int i) { return FALSE; @@ -46,22 +81,24 @@ decompose_single_char_cluster (hb_ot_shape_context_t *c, static bool handle_single_char_cluster (hb_ot_shape_context_t *c, + bool recompose, unsigned int i) { - /* If the single char is supported by the font, we're good. */ - if (get_glyph (c, i)) + /* If recomposing and the single char is supported by the font, we're good. */ + if (recompose && get_glyph (c, i)) return FALSE; /* Decompose */ - return decompose_single_char_cluster (c, i); + return decompose_single_char_cluster (c, recompose, i); } static bool handle_multi_char_cluster (hb_ot_shape_context_t *c, + bool recompose, unsigned int start, unsigned int end) { - /* If there's a variation-selector, give-up, it's just too hard. */ + /* TODO Currently if there's a variation-selector we give-up, it's just too hard. 
*/ for (unsigned int i = start; i < end; i++) if (unlikely (is_variation_selector (c->buffer->info[i].codepoint))) return FALSE; @@ -70,24 +107,33 @@ handle_multi_char_cluster (hb_ot_shape_context_t *c, } bool -_hb_normalize (hb_ot_shape_context_t *c) +_hb_ot_shape_normalize (hb_ot_shape_context_t *c) { - hb_buffer_t *b = c->buffer; + hb_buffer_t *buffer = c->buffer; bool changed = FALSE; + bool recompose = !hb_ot_shape_complex_prefer_decomposed (c->plan->shaper); + + buffer->clear_output (); + + unsigned int count = buffer->len; + for (buffer->i = 0; buffer->i < count;) + { - unsigned int count = b->len; - for (unsigned int i = 0; i < count;) { unsigned int end; - for (end = i + 1; end < count; end++) - if (b->info[i].cluster != b->info[end].cluster) + for (end = buffer->i + 1; end < count; end++) + if (buffer->info[buffer->i].cluster != buffer->info[end].cluster) break; - if (i + 1 == end) - changed |= handle_single_char_cluster (c, i); + + if (buffer->i + 1 == end) + changed |= handle_single_char_cluster (c, recompose, buffer->i); else - changed |= handle_multi_char_cluster (c, i, end); - i = end; + changed |= handle_multi_char_cluster (c, recompose, buffer->i, end); + while (buffer->i < end) + c->buffer->next_glyph (); } + buffer->swap (); + return changed; } diff --git a/src/hb-ot-shape-private.hh b/src/hb-ot-shape-private.hh index 96c436d..17b3c99 100644 --- a/src/hb-ot-shape-private.hh +++ b/src/hb-ot-shape-private.hh @@ -100,7 +100,7 @@ is_variation_selector (hb_codepoint_t unicode) } -HB_INTERNAL bool _hb_normalize (hb_ot_shape_context_t *c); +HB_INTERNAL bool _hb_ot_shape_normalize (hb_ot_shape_context_t *c); HB_END_DECLS diff --git a/src/hb-ot-shape.cc b/src/hb-ot-shape.cc index bffd075..d1c495f 100644 --- a/src/hb-ot-shape.cc +++ b/src/hb-ot-shape.cc @@ -254,11 +254,13 @@ static void hb_map_glyphs (hb_font_t *font, hb_buffer_t *buffer) { + hb_codepoint_t glyph; + if (unlikely (!buffer->len)) return; - hb_codepoint_t glyph; buffer->clear_output (); + 
unsigned int count = buffer->len - 1; for (buffer->i = 0; buffer->i < count;) { if (unlikely (is_variation_selector (buffer->info[buffer->i + 1].codepoint))) { @@ -363,7 +365,7 @@ hb_ot_shape_execute_internal (hb_ot_shape_context_t *c) hb_ensure_native_direction (c->buffer); - if (_hb_normalize (c)) + if (_hb_ot_shape_normalize (c)) /* Buffer contents changed, reset unicode_props */ hb_set_unicode_props (c->buffer); /* BUFFER: Set general_category and combining_class in var1 */ commit 02cdf743c2ec345a44d4fcf865594b6ac13fccd0 Author: Behdad Esfahbod <[email protected]> Date: Thu Jul 21 12:23:12 2011 -0400 Add prefer_decomposed() complex-shaper callback This allows the Indic shaper to request decomposed characters. This will handle split matra for free. Other shapers prefer precomposed characters. diff --git a/src/hb-ot-shape-complex-arabic.cc b/src/hb-ot-shape-complex-arabic.cc index 53e7a9b..dc63db2 100644 --- a/src/hb-ot-shape-complex-arabic.cc +++ b/src/hb-ot-shape-complex-arabic.cc @@ -183,6 +183,12 @@ _hb_ot_shape_complex_collect_features_arabic (hb_ot_map_builder_t *map, const hb map->add_bool_feature (HB_TAG('c','s','w','h')); } +bool +_hb_ot_shape_complex_prefer_decomposed_arabic (void) +{ + return FALSE; +} + void _hb_ot_shape_complex_setup_masks_arabic (hb_ot_map_t *map, hb_buffer_t *buffer) { diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc index 03ea10f..cf5a049 100644 --- a/src/hb-ot-shape-complex-indic.cc +++ b/src/hb-ot-shape-complex-indic.cc @@ -309,6 +309,13 @@ _hb_ot_shape_complex_collect_features_indic (hb_ot_map_builder_t *map, const hb_ } +bool +_hb_ot_shape_complex_prefer_decomposed_indic (void) +{ + /* We want split matras decomposed by the common shaping logic. 
*/ + return TRUE; +} + static void found_syllable (hb_ot_map_t *map, hb_buffer_t *buffer, unsigned int start, unsigned int end) diff --git a/src/hb-ot-shape-complex-misc.cc b/src/hb-ot-shape-complex-misc.cc index 92dee49..b2de3ff 100644 --- a/src/hb-ot-shape-complex-misc.cc +++ b/src/hb-ot-shape-complex-misc.cc @@ -42,6 +42,12 @@ _hb_ot_shape_complex_collect_features_default (hb_ot_map_builder_t *map, const h { } +bool +_hb_ot_shape_complex_prefer_decomposed_default (void) +{ + return FALSE; +} + void _hb_ot_shape_complex_setup_masks_default (hb_ot_map_t *map, hb_buffer_t *buffer) { diff --git a/src/hb-ot-shape-complex-private.hh b/src/hb-ot-shape-complex-private.hh index c10fdf9..4bfd855 100644 --- a/src/hb-ot-shape-complex-private.hh +++ b/src/hb-ot-shape-complex-private.hh @@ -140,7 +140,34 @@ hb_ot_shape_complex_collect_features (hb_ot_complex_shaper_t shaper, switch (shaper) { default: #define HB_COMPLEX_SHAPER_IMPLEMENT(name) \ - case hb_ot_complex_shaper_##name: _hb_ot_shape_complex_collect_features_##name (map, props); return; + case hb_ot_complex_shaper_##name: _hb_ot_shape_complex_collect_features_##name (map, props); return; + HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS +#undef HB_COMPLEX_SHAPER_IMPLEMENT + } +} + + +/* + * prefer_decomposed() + * + * Called during shape_execute(). + * + * Shapers should return TRUE if it prefers decomposed (NFD) input rather than precomposed (NFC). 
+ */ + +typedef bool hb_ot_shape_complex_prefer_decomposed_func_t (void); +#define HB_COMPLEX_SHAPER_IMPLEMENT(name) \ + HB_INTERNAL hb_ot_shape_complex_prefer_decomposed_func_t _hb_ot_shape_complex_prefer_decomposed_##name; + HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS +#undef HB_COMPLEX_SHAPER_IMPLEMENT + +static inline bool +hb_ot_shape_complex_prefer_decomposed (hb_ot_complex_shaper_t shaper) +{ + switch (shaper) { + default: +#define HB_COMPLEX_SHAPER_IMPLEMENT(name) \ + case hb_ot_complex_shaper_##name: return _hb_ot_shape_complex_prefer_decomposed_##name (); HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS #undef HB_COMPLEX_SHAPER_IMPLEMENT } @@ -168,7 +195,7 @@ hb_ot_shape_complex_setup_masks (hb_ot_complex_shaper_t shaper, switch (shaper) { default: #define HB_COMPLEX_SHAPER_IMPLEMENT(name) \ - case hb_ot_complex_shaper_##name: _hb_ot_shape_complex_setup_masks_##name (map, buffer); return; + case hb_ot_complex_shaper_##name: _hb_ot_shape_complex_setup_masks_##name (map, buffer); return; HB_COMPLEX_SHAPERS_IMPLEMENT_SHAPERS #undef HB_COMPLEX_SHAPER_IMPLEMENT } commit d6b9c6d20041b4f4fa11befc179aee757c41904d Author: Behdad Esfahbod <[email protected]> Date: Thu Jul 21 12:16:45 2011 -0400 More kicking diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc index 0a245b0..a791e7c 100644 --- a/src/hb-ot-shape-normalize.cc +++ b/src/hb-ot-shape-normalize.cc @@ -38,23 +38,34 @@ get_glyph (hb_ot_shape_context_t *c, unsigned int i) } static bool +decompose_single_char_cluster (hb_ot_shape_context_t *c, + unsigned int i) +{ + return FALSE; +} + +static bool handle_single_char_cluster (hb_ot_shape_context_t *c, unsigned int i) { + /* If the single char is supported by the font, we're good. 
*/ if (get_glyph (c, i)) return FALSE; /* Decompose */ - - return FALSE; + return decompose_single_char_cluster (c, i); } static bool handle_multi_char_cluster (hb_ot_shape_context_t *c, - unsigned int i, + unsigned int start, unsigned int end) { /* If there's a variation-selector, give-up, it's just too hard. */ + for (unsigned int i = start; i < end; i++) + if (unlikely (is_variation_selector (c->buffer->info[i].codepoint))) + return FALSE; + return FALSE; } commit 192445aef2e50087049243ce54ce7059ec441ffa Author: Behdad Esfahbod <[email protected]> Date: Thu Jul 21 12:13:04 2011 -0400 Remove intermittent_glyph() Lets not worry about performance for now... diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc index f6e962c..0a245b0 100644 --- a/src/hb-ot-shape-normalize.cc +++ b/src/hb-ot-shape-normalize.cc @@ -32,8 +32,9 @@ static bool get_glyph (hb_ot_shape_context_t *c, unsigned int i) { hb_buffer_t *b = c->buffer; + hb_codepoint_t glyph; - return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &b->info[i].intermittent_glyph()); + return hb_font_get_glyph (c->font, b->info[i].codepoint, 0, &glyph); } static bool diff --git a/src/hb-ot-shape-private.hh b/src/hb-ot-shape-private.hh index 03dd4ed..96c436d 100644 --- a/src/hb-ot-shape-private.hh +++ b/src/hb-ot-shape-private.hh @@ -102,8 +102,6 @@ is_variation_selector (hb_codepoint_t unicode) HB_INTERNAL bool _hb_normalize (hb_ot_shape_context_t *c); -#define intermittent_glyph() var2.u32 - HB_END_DECLS #endif /* HB_OT_SHAPE_PRIVATE_HH */ _______________________________________________ HarfBuzz mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/harfbuzz
