docs/usermanual-clusters.xml | 772 ++++++++++++++++++++++++++-------------- src/hb-aat-layout-kerx-table.hh | 5 src/hb-aat-layout-morx-table.hh | 5 src/hb-aat-map.hh | 4 src/hb-dsalgs.hh | 110 +++++ src/hb-face.cc | 6 src/hb-open-type.hh | 106 ++--- src/hb-ot-map.hh | 4 src/hb-set.hh | 2 src/hb-vector.hh | 88 +--- 10 files changed, 698 insertions(+), 404 deletions(-)
New commits: commit 8dcc1913a1670ede7b124f7b5b775d7ab8791386 Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 09:47:45 2018 -0500 [kerx/morx] Make sure object length is sanitized before accessing it diff --git a/src/hb-aat-layout-kerx-table.hh b/src/hb-aat-layout-kerx-table.hh index 521c4c72..2d548932 100644 --- a/src/hb-aat-layout-kerx-table.hh +++ b/src/hb-aat-layout-kerx-table.hh @@ -962,6 +962,11 @@ struct KerxTable unsigned int count = thiz()->tableCount; for (unsigned int i = 0; i < count; i++) { + if (unlikely (!st->u.header.sanitize (c))) + { + c->reset_object (); + return_trace (false); + } /* OpenType kern table has 2-byte subtable lengths. That's limiting. * MS implementation also only supports one subtable, of format 0, * anyway. Certain versions of some fonts, like Calibry, contain diff --git a/src/hb-aat-layout-morx-table.hh b/src/hb-aat-layout-morx-table.hh index 7a39eea8..5b44a4cf 100644 --- a/src/hb-aat-layout-morx-table.hh +++ b/src/hb-aat-layout-morx-table.hh @@ -1061,6 +1061,11 @@ struct Chain unsigned int count = subtableCount; for (unsigned int i = 0; i < count; i++) { + if (unlikely (!c->check_struct (subtable))) + { + c->reset_object (); + return_trace (false); + } c->set_object (*subtable); if (!subtable->sanitize (c)) return_trace (false); commit 70d80c90fe2f4eca66bec3e1d313bbf7e4d0ab65 Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:59:50 2018 -0500 [arrays] Port ArrayOf.qsort() and hb_vector_t.qsort() to hb_array_t diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh index 87606072..cc1a1d42 100644 --- a/src/hb-dsalgs.hh +++ b/src/hb-dsalgs.hh @@ -560,6 +560,9 @@ struct hb_bytes_t }; template <typename Type> +struct hb_sorted_array_t; + +template <typename Type> struct hb_array_t { static_assert ((bool) (unsigned) hb_static_size (Type), ""); @@ -600,13 +603,20 @@ struct hb_array_t return not_found; } - inline void qsort (int (*cmp)(const void*, const void*)) + inline hb_sorted_array_t<Type> qsort (int (*cmp)(const void*, const void*)) { ::qsort (arrayZ, len, sizeof (Type), cmp); + return hb_sorted_array_t<Type> (*this); + } + inline hb_sorted_array_t<Type> qsort (void) + { + ::qsort (arrayZ, len, sizeof (Type), Type::cmp); + return hb_sorted_array_t<Type> (*this); } - inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1) + inline void qsort (unsigned int start, unsigned int end) { end = MIN (end, len); + assert (start <= end); ::qsort (arrayZ + start, end - start, sizeof (Type), Type::cmp); } diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh index 4704861d..c7154c42 100644 --- a/src/hb-open-type.hh +++ b/src/hb-open-type.hh @@ -375,6 +375,16 @@ struct UnsizedArrayOf inline hb_array_t<const Type> as_array (unsigned int len) const { return hb_array (arrayZ, len); } + template <typename T> + inline Type &lsearch (unsigned int len, const T &x) + { return *as_array (len).lsearch (x, &Crap (T)); } + template <typename T> + inline const Type &lsearch (unsigned int len, const T &x) const + { return *as_array (len).lsearch (x, &Null (T)); } + + inline void qsort (unsigned int len, unsigned int start = 0, unsigned int end = (unsigned int) -1) + { as_array (len).qsort (start, end); } + inline bool sanitize (hb_sanitize_context_t *c, unsigned int count) const { TRACE_SANITIZE (this); @@ -577,8 +587,8 @@ struct ArrayOf inline const Type &lsearch (const T &x) const { return *as_array ().lsearch (x, &Null (T)); } - inline void qsort (void) - { as_array ().qsort (); } + inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1) + { as_array ().qsort (start, end); } inline bool sanitize_shallow (hb_sanitize_context_t *c) const { commit 073d837aa2394d29dda72679802d583c559c3c5b Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:48:48 2018 -0500 [arrays] Port ArrayOf.qsort() to hb_array_t's diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh index eaefc3bc..4704861d 100644 --- a/src/hb-open-type.hh +++ b/src/hb-open-type.hh @@ -578,9 +578,7 @@ struct ArrayOf { return *as_array ().lsearch (x, &Null (T)); } inline void qsort (void) - { - ::qsort (arrayZ, len, sizeof (Type), Type::cmp); - } + { as_array ().qsort (); } inline bool sanitize_shallow (hb_sanitize_context_t *c) const { commit ad5c871d801b481f95dd32c8b65ecc70def597be Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:47:49 2018 -0500 [arrays] Add copy-constructor to hb_array_t and hb_sorted_array_t diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh index ef030732..87606072 100644 --- a/src/hb-dsalgs.hh +++ b/src/hb-dsalgs.hh @@ -564,6 +564,7 @@ struct hb_array_t { static_assert ((bool) (unsigned) hb_static_size (Type), ""); + inline hb_array_t (const hb_array_t &o) : arrayZ (o.arrayZ), len (o.len) {} inline hb_array_t (Type *array_, unsigned int len_) : arrayZ (array_), len (len_) {} inline Type& operator [] (unsigned int i) const @@ -642,6 +643,7 @@ inline hb_array_t<T> hb_array (T *array, unsigned int len) template <typename Type> struct hb_sorted_array_t : hb_array_t<Type> { + inline hb_sorted_array_t (const hb_array_t<Type> &o) : hb_array_t<Type> (o) {} inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {} template <typename T> commit 61de55bf496c1edb120e4d096140eb1125552bbe Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:45:58 2018 -0500 [arrays] Port hb_vector_t.qsort() to hb_array_t's diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh index e66f2b4c..ef030732 100644 --- a/src/hb-dsalgs.hh +++ b/src/hb-dsalgs.hh @@ -572,6 +572,10 @@ struct hb_array_t return arrayZ[i]; } + template <typename T> inline operator T * (void) const { return arrayZ; } + + inline Type * operator & (void) const { return arrayZ; } + inline unsigned int get_size (void) const { return len * sizeof (Type); } template <typename T> @@ -595,9 +599,15 @@ struct hb_array_t return not_found; } - template <typename T> inline operator T * (void) const { return arrayZ; } - - inline Type * operator & (void) const { return arrayZ; } + inline void qsort (int (*cmp)(const void*, const void*)) + { + ::qsort (arrayZ, len, sizeof (Type), cmp); + } + inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1) + { + end = MIN (end, len); + ::qsort (arrayZ + start, end - start, sizeof (Type), Type::cmp); + } inline hb_array_t<Type> sub_array (unsigned int start_offset, unsigned int seg_count) const { diff --git a/src/hb-vector.hh b/src/hb-vector.hh index a5fa4142..a8c98d22 100644 --- a/src/hb-vector.hh +++ b/src/hb-vector.hh @@ -221,15 +221,9 @@ struct hb_vector_t } inline void qsort (int (*cmp)(const void*, const void*)) - { - ::qsort (arrayZ(), len, sizeof (Type), cmp); - } - + { as_array ().qsort (cmp); } inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1) - { - end = MIN (end, len); - ::qsort (arrayZ() + start, end - start, sizeof (Type), Type::cmp); - } + { as_array ().qsort (start, end); } template <typename T> inline Type *lsearch (const T &x, Type *not_found = nullptr) commit e3face8e791d677f94154e8a7f3d787d0d69a02f Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:42:17 2018 -0500 [arrays] Remove one flavor of hb_vector_t.qsort() diff --git a/src/hb-vector.hh b/src/hb-vector.hh index 085523f8..a5fa4142 100644 --- a/src/hb-vector.hh +++ b/src/hb-vector.hh @@ -225,13 +225,9 @@ struct hb_vector_t ::qsort (arrayZ(), len, sizeof (Type), cmp); } - inline void qsort (void) - { - ::qsort (arrayZ(), len, sizeof (Type), Type::cmp); - } - - inline void qsort (unsigned int start, unsigned int end) + inline void qsort (unsigned int start = 0, unsigned int end = (unsigned int) -1) { + end = MIN (end, len); ::qsort (arrayZ() + start, end - start, sizeof (Type), Type::cmp); } commit 7c1600dcd9813ca560ecccc5c54877a5750caf4e Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:37:11 2018 -0500 [arrays] Add (unused) SortedUnsizedArrayOf<> diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh index 0ea88431..eaefc3bc 100644 --- a/src/hb-open-type.hh +++ b/src/hb-open-type.hh @@ -449,6 +449,27 @@ struct UnsizedOffsetListOf : UnsizedOffsetArrayOf<Type, OffsetType, has_null> } }; +/* An array with sorted elements. Supports binary searching. */ +template <typename Type> +struct SortedUnsizedArrayOf : UnsizedArrayOf<Type> +{ + inline hb_sorted_array_t<Type> as_array (unsigned int len) + { return hb_sorted_array (this->arrayZ, len); } + inline hb_sorted_array_t<const Type> as_array (unsigned int len) const + { return hb_sorted_array (this->arrayZ, len); } + + template <typename T> + inline Type &bsearch (unsigned int len, const T &x) + { return *as_array (len).bsearch (x, &Crap (Type)); } + template <typename T> + inline const Type &bsearch (unsigned int len, const T &x) const + { return *as_array (len).bsearch (x, &Null (Type)); } + template <typename T> + inline bool bfind (unsigned int len, const T &x, unsigned int *i = nullptr) const + { return as_array (len).bfind (x, i); } +}; + + /* An array with a number of elements. */ template <typename Type, typename LenType=HBUINT16> struct ArrayOf commit e700392f5cbf366f1e03dc7e7b1a2eb6c3027b92 Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:31:00 2018 -0500 [arrays] Port SortedArrayOf.bsearch/bfind to hb_sorted_array_t's diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh index a7592828..e66f2b4c 100644 --- a/src/hb-dsalgs.hh +++ b/src/hb-dsalgs.hh @@ -635,22 +635,19 @@ struct hb_sorted_array_t : hb_array_t<Type> inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {} template <typename T> - inline Type *bsearch (const T &x, - Type *not_found = nullptr) + inline Type *bsearch (const T &x, Type *not_found = nullptr) { unsigned int i; return bfind (x, &i) ? &this->arrayZ[i] : not_found; } template <typename T> - inline const Type *bsearch (const T &x, - const Type *not_found = nullptr) const + inline const Type *bsearch (const T &x, const Type *not_found = nullptr) const { unsigned int i; return bfind (x, &i) ? &this->arrayZ[i] : not_found; } template <typename T> - inline bool bfind (const T &x, - unsigned int *i = nullptr) const + inline bool bfind (const T &x, unsigned int *i = nullptr) const { int min = 0, max = (int) this->len - 1; const Type *array = this->arrayZ; diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh index e384a7ad..0ea88431 100644 --- a/src/hb-open-type.hh +++ b/src/hb-open-type.hh @@ -551,14 +551,10 @@ struct ArrayOf template <typename T> inline Type &lsearch (const T &x) - { - return *as_array ().lsearch (x, &Crap (T)); - } + { return *as_array ().lsearch (x, &Crap (T)); } template <typename T> inline const Type &lsearch (const T &x) const - { - return *as_array ().lsearch (x, &Null (T)); - } + { return *as_array ().lsearch (x, &Null (T)); } inline void qsort (void) { @@ -745,46 +741,20 @@ struct ArrayOfM1 template <typename Type, typename LenType=HBUINT16> struct SortedArrayOf : ArrayOf<Type, LenType> { + inline hb_sorted_array_t<Type> as_array (void) + { return hb_sorted_array (this->arrayZ, this->len); } + inline hb_sorted_array_t<const Type> as_array (void) const + { return hb_sorted_array (this->arrayZ, this->len); } + template <typename T> inline Type &bsearch (const T &x) - { - unsigned int i; - return bfind (x, &i) ? this->arrayZ[i] : Crap(Type); - } + { return *as_array ().bsearch (x, &Crap (Type)); } template <typename T> inline const Type &bsearch (const T &x) const - { - unsigned int i; - return bfind (x, &i) ? this->arrayZ[i] : Null(Type); - } + { return *as_array ().bsearch (x, &Null (Type)); } template <typename T> inline bool bfind (const T &x, unsigned int *i = nullptr) const - { - int min = 0, max = (int) this->len - 1; - const Type *array = this->arrayZ; - while (min <= max) - { - int mid = ((unsigned int) min + (unsigned int) max) / 2; - int c = array[mid].cmp (x); - if (c < 0) - max = mid - 1; - else if (c > 0) - min = mid + 1; - else - { - if (i) - *i = mid; - return true; - } - } - if (i) - { - if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0)) - max++; - *i = max; - } - return false; - } + { return as_array ().bfind (x, i); } }; /* diff --git a/src/hb-vector.hh b/src/hb-vector.hh index 14967a41..085523f8 100644 --- a/src/hb-vector.hh +++ b/src/hb-vector.hh @@ -236,36 +236,21 @@ struct hb_vector_t } template <typename T> - inline Type *lsearch (const T &x, - Type *not_found = nullptr) - { - return as_array ().lsearch (x, not_found); - } + inline Type *lsearch (const T &x, Type *not_found = nullptr) + { return as_array ().lsearch (x, not_found); } template <typename T> - inline const Type *lsearch (const T &x, - const Type *not_found = nullptr) const - { - return as_array ().lsearch (x, not_found); - } + inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const + { return as_array ().lsearch (x, not_found); } template <typename T> - inline Type *bsearch (const T &x, - Type *not_found = nullptr) - { - return as_sorted_array ().bsearch (x, not_found); - } + inline Type *bsearch (const T &x, Type *not_found = nullptr) + { return as_sorted_array ().bsearch (x, not_found); } template <typename T> - inline const Type *bsearch (const T &x, - const Type *not_found = nullptr) const - { - return as_sorted_array ().bsearch (x, not_found); - } + inline const Type *bsearch (const T &x, const Type *not_found = nullptr) const + { return as_sorted_array ().bsearch (x, not_found); } template <typename T> - inline bool bfind (const T &x, - unsigned int *i = nullptr) const - { - return as_sorted_array ().bfind (x, i); - } + inline bool bfind (const T &x, unsigned int *i = nullptr) const + { return as_sorted_array ().bfind (x, i); } }; commit e604306f2829804e9016966c1378166253b19d29 Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:24:48 2018 -0500 [arrays] Port hb_vector_t.bsearch/bfind to (new) hb_sorted_array_t's diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh index 1d5bb352..a7592828 100644 --- a/src/hb-dsalgs.hh +++ b/src/hb-dsalgs.hh @@ -564,7 +564,6 @@ struct hb_array_t { static_assert ((bool) (unsigned) hb_static_size (Type), ""); - inline hb_array_t (void) : arrayZ (nullptr), len (0) {} inline hb_array_t (Type *array_, unsigned int len_) : arrayZ (array_), len (len_) {} inline Type& operator [] (unsigned int i) const @@ -576,7 +575,8 @@ struct hb_array_t inline unsigned int get_size (void) const { return len * sizeof (Type); } template <typename T> - inline Type *lsearch (const T &x, Type *not_found = nullptr) + inline Type *lsearch (const T &x, + Type *not_found = nullptr) { unsigned int count = len; for (unsigned int i = 0; i < count; i++) @@ -585,7 +585,8 @@ struct hb_array_t return not_found; } template <typename T> - inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const + inline const Type *lsearch (const T &x, + const Type *not_found = nullptr) const { unsigned int count = len; for (unsigned int i = 0; i < count; i++) @@ -625,7 +626,61 @@ struct hb_array_t unsigned int len; }; template <typename T> -inline hb_array_t<T> hb_array (T *array, unsigned int len) { return hb_array_t<T> (array, len); } +inline hb_array_t<T> hb_array (T *array, unsigned int len) +{ return hb_array_t<T> (array, len); } + +template <typename Type> +struct hb_sorted_array_t : hb_array_t<Type> +{ + inline hb_sorted_array_t (Type *array_, unsigned int len_) : hb_array_t<Type> (array_, len_) {} + + template <typename T> + inline Type *bsearch (const T &x, + Type *not_found = nullptr) + { + unsigned int i; + return bfind (x, &i) ? &this->arrayZ[i] : not_found; + } + template <typename T> + inline const Type *bsearch (const T &x, + const Type *not_found = nullptr) const + { + unsigned int i; + return bfind (x, &i) ? &this->arrayZ[i] : not_found; + } + template <typename T> + inline bool bfind (const T &x, + unsigned int *i = nullptr) const + { + int min = 0, max = (int) this->len - 1; + const Type *array = this->arrayZ; + while (min <= max) + { + int mid = ((unsigned int) min + (unsigned int) max) / 2; + int c = array[mid].cmp (x); + if (c < 0) + max = mid - 1; + else if (c > 0) + min = mid + 1; + else + { + if (i) + *i = mid; + return true; + } + } + if (i) + { + if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0)) + max++; + *i = max; + } + return false; + } +}; +template <typename T> +inline hb_sorted_array_t<T> hb_sorted_array (T *array, unsigned int len) +{ return hb_sorted_array_t<T> (array, len); } struct HbOpOr diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh index 7fa38b1c..e384a7ad 100644 --- a/src/hb-open-type.hh +++ b/src/hb-open-type.hh @@ -370,8 +370,10 @@ struct UnsizedArrayOf inline unsigned int get_size (unsigned int len) const { return len * Type::static_size; } - inline hb_array_t<Type> as_array (unsigned int len) { return hb_array_t<Type> (arrayZ, len); } - inline hb_array_t<const Type> as_array (unsigned int len) const { return hb_array_t<const Type> (arrayZ, len); } + inline hb_array_t<Type> as_array (unsigned int len) + { return hb_array (arrayZ, len); } + inline hb_array_t<const Type> as_array (unsigned int len) const + { return hb_array (arrayZ, len); } inline bool sanitize (hb_sanitize_context_t *c, unsigned int count) const { @@ -483,8 +485,10 @@ struct ArrayOf inline unsigned int get_size (void) const { return len.static_size + len * Type::static_size; } - inline hb_array_t<Type> as_array (void) { return hb_array_t<Type> (arrayZ, len); } - inline hb_array_t<const Type> as_array (void) const { return hb_array_t<const Type> (arrayZ, len); } + inline hb_array_t<Type> as_array (void) + { return hb_array (arrayZ, len); } + inline hb_array_t<const Type> as_array (void) const + { return hb_array (arrayZ, len); } inline bool serialize (hb_serialize_context_t *c, unsigned int items_len) diff --git a/src/hb-vector.hh b/src/hb-vector.hh index d1fb65df..14967a41 100644 --- a/src/hb-vector.hh +++ b/src/hb-vector.hh @@ -91,8 +91,15 @@ struct hb_vector_t return arrayZ()[i]; } - inline hb_array_t<Type> as_array (void) { return hb_array_t<Type> (arrayZ(), len); } - inline hb_array_t<const Type> as_array (void) const { return hb_array_t<const Type> (arrayZ(), len); } + inline hb_array_t<Type> as_array (void) + { return hb_array (arrayZ(), len); } + inline hb_array_t<const Type> as_array (void) const + { return hb_array (arrayZ(), len); } + + inline hb_sorted_array_t<Type> as_sorted_array (void) + { return hb_sorted_array (arrayZ(), len); } + inline hb_sorted_array_t<const Type> as_sorted_array (void) const + { return hb_sorted_array (arrayZ(), len); } template <typename T> inline operator T * (void) { return arrayZ(); } template <typename T> inline operator const T * (void) const { return arrayZ(); } @@ -229,55 +236,35 @@ struct hb_vector_t } template <typename T> - inline Type *lsearch (const T &x, Type *not_found = nullptr) + inline Type *lsearch (const T &x, + Type *not_found = nullptr) { return as_array ().lsearch (x, not_found); } template <typename T> - inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const + inline const Type *lsearch (const T &x, + const Type *not_found = nullptr) const { return as_array ().lsearch (x, not_found); } template <typename T> - inline Type *bsearch (const T &x) + inline Type *bsearch (const T &x, + Type *not_found = nullptr) { - unsigned int i; - return bfind (x, &i) ? &arrayZ()[i] : nullptr; + return as_sorted_array ().bsearch (x, not_found); } template <typename T> - inline const Type *bsearch (const T &x) const + inline const Type *bsearch (const T &x, + const Type *not_found = nullptr) const { - unsigned int i; - return bfind (x, &i) ? &arrayZ()[i] : nullptr; + return as_sorted_array ().bsearch (x, not_found); } template <typename T> - inline bool bfind (const T &x, unsigned int *i = nullptr) const + inline bool bfind (const T &x, + unsigned int *i = nullptr) const { - int min = 0, max = (int) this->len - 1; - const Type *array = this->arrayZ(); - while (min <= max) - { - int mid = ((unsigned int) min + (unsigned int) max) / 2; - int c = array[mid].cmp (x); - if (c < 0) - max = mid - 1; - else if (c > 0) - min = mid + 1; - else - { - if (i) - *i = mid; - return true; - } - } - if (i) - { - if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0)) - max++; - *i = max; - } - return false; + return as_sorted_array ().bfind (x, i); } }; commit 268eca24921e85eda98f4f0cce05d40c7235ba62 Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:11:12 2018 -0500 [arrays] Port (unused) ArrayOf.lsearch() to hb_array_t's diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh index 4e0c033f..7fa38b1c 100644 --- a/src/hb-open-type.hh +++ b/src/hb-open-type.hh @@ -548,20 +548,12 @@ struct ArrayOf template <typename T> inline Type &lsearch (const T &x) { - unsigned int count = len; - for (unsigned int i = 0; i < count; i++) - if (!this->arrayZ[i].cmp (x)) - return this->arrayZ[i]; - return Crap (T); + return *as_array ().lsearch (x, &Crap (T)); } template <typename T> inline const Type &lsearch (const T &x) const { - unsigned int count = len; - for (unsigned int i = 0; i < count; i++) - if (!this->arrayZ[i].cmp (x)) - return this->arrayZ[i]; - return Null (T); + return *as_array ().lsearch (x, &Null (T)); } inline void qsort (void) commit 830856ba6b9454bf507e00416f9d45e9975fb7dc Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:09:28 2018 -0500 [arrays] Port hb_vector_t.lsearch() to hb_array_t's diff --git a/src/hb-dsalgs.hh b/src/hb-dsalgs.hh index c133a532..1d5bb352 100644 --- a/src/hb-dsalgs.hh +++ b/src/hb-dsalgs.hh @@ -575,9 +575,24 @@ struct hb_array_t inline unsigned int get_size (void) const { return len * sizeof (Type); } - template <typename hb_sanitize_context_t> - inline bool sanitize (hb_sanitize_context_t *c) const - { return c->check_array (arrayZ, len); } + template <typename T> + inline Type *lsearch (const T &x, Type *not_found = nullptr) + { + unsigned int count = len; + for (unsigned int i = 0; i < count; i++) + if (!this->arrayZ[i].cmp (x)) + return &this->arrayZ[i]; + return not_found; + } + template <typename T> + inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const + { + unsigned int count = len; + for (unsigned int i = 0; i < count; i++) + if (!this->arrayZ[i].cmp (x)) + return &this->arrayZ[i]; + return not_found; + } template <typename T> inline operator T * (void) const { return arrayZ; } @@ -601,6 +616,11 @@ struct hb_array_t inline void free (void) { ::free ((void *) arrayZ); arrayZ = nullptr; len = 0; } + template <typename hb_sanitize_context_t> + inline bool sanitize (hb_sanitize_context_t *c) const + { return c->check_array (arrayZ, len); } + + public: Type *arrayZ; unsigned int len; }; diff --git a/src/hb-vector.hh b/src/hb-vector.hh index e17f8897..d1fb65df 100644 --- a/src/hb-vector.hh +++ b/src/hb-vector.hh @@ -229,22 +229,14 @@ struct hb_vector_t } template <typename T> - inline Type *lsearch (const T &x) + inline Type *lsearch (const T &x, Type *not_found = nullptr) { - Type *array = arrayZ(); - for (unsigned int i = 0; i < len; i++) - if (0 == array[i].cmp (x)) - return &array[i]; - return nullptr; + return as_array ().lsearch (x, not_found); } template <typename T> - inline const Type *lsearch (const T &x) const + inline const Type *lsearch (const T &x, const Type *not_found = nullptr) const { - const Type *array = arrayZ(); - for (unsigned int i = 0; i < len; i++) - if (0 == array[i].cmp (x)) - return &array[i]; - return nullptr; + return as_array ().lsearch (x, not_found); } template <typename T> commit 96cf0889804b7d72a96274b25641bb18f7dd2e1e Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 01:07:15 2018 -0500 [arrays] More diff --git a/src/hb-face.cc b/src/hb-face.cc index e23842fb..5b33784f 100644 --- a/src/hb-face.cc +++ b/src/hb-face.cc @@ -588,10 +588,10 @@ struct hb_face_builder_data_t { struct table_entry_t { - inline int cmp (const hb_tag_t *t) const + inline int cmp (hb_tag_t t) const { - if (*t < tag) return -1; - if (*t > tag) return -1; + if (t < tag) return -1; + if (t > tag) return -1; return 0; } diff --git a/src/hb-vector.hh b/src/hb-vector.hh index 436b92bb..e17f8897 100644 --- a/src/hb-vector.hh +++ b/src/hb-vector.hh @@ -233,7 +233,7 @@ struct hb_vector_t { Type *array = arrayZ(); for (unsigned int i = 0; i < len; i++) - if (0 == array[i].cmp (&x)) + if (0 == array[i].cmp (x)) return &array[i]; return nullptr; } @@ -242,7 +242,7 @@ struct hb_vector_t { const Type *array = arrayZ(); for (unsigned int i = 0; i < len; i++) - if (0 == array[i].cmp (&x)) + if (0 == array[i].cmp (x)) return &array[i]; return nullptr; } commit 3e26c8d2b10fc08642c25c7f13aef68b0b1008f6 Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 00:58:44 2018 -0500 [arrays] Update ArrayOf.lsearch() Currently unused apparently diff --git a/src/hb-open-type.hh b/src/hb-open-type.hh index f255ea12..4e0c033f 100644 --- a/src/hb-open-type.hh +++ b/src/hb-open-type.hh @@ -545,14 +545,23 @@ struct ArrayOf return_trace (true); } - template <typename SearchType> - inline int lsearch (const SearchType &x) const + template <typename T> + inline Type &lsearch (const T &x) + { + unsigned int count = len; + for (unsigned int i = 0; i < count; i++) + if (!this->arrayZ[i].cmp (x)) + return this->arrayZ[i]; + return Crap (T); + } + template <typename T> + inline const Type &lsearch (const T &x) const { unsigned int count = len; for (unsigned int i = 0; i < count; i++) if (!this->arrayZ[i].cmp (x)) - return i; - return -1; + return this->arrayZ[i]; + return Null (T); } inline void qsort (void) commit 22e1857b01c71714245ddca05cb3fa0127bf7da2 Author: Behdad Esfahbod <beh...@behdad.org> Date: Sat Nov 24 00:53:19 2018 -0500 [arrays] Change argument type of cmp called by hb_vector_t.bsearch() Towards consolidating all array bsearch/... diff --git a/src/hb-aat-map.hh b/src/hb-aat-map.hh index 07454b2c..7d85c7c3 100644 --- a/src/hb-aat-map.hh +++ b/src/hb-aat-map.hh @@ -77,9 +77,9 @@ struct hb_aat_map_builder_t (a->seq < b->seq ? -1 : a->seq > b->seq ? 1 : 0); } - int cmp (const short unsigned int *ty) const + int cmp (unsigned int ty) const { - return (type != *ty) ? (type < *ty ? -1 : 1) : 0; + return (type != ty) ? (type < ty ? -1 : 1) : 0; } }; diff --git a/src/hb-ot-map.hh b/src/hb-ot-map.hh index 8e1f5aa8..0a5a4fbc 100644 --- a/src/hb-ot-map.hh +++ b/src/hb-ot-map.hh @@ -57,8 +57,8 @@ struct hb_ot_map_t unsigned int auto_zwj : 1; unsigned int random : 1; - inline int cmp (const hb_tag_t *tag_) const - { return *tag_ < tag ? -1 : *tag_ > tag ? 1 : 0; } + inline int cmp (const hb_tag_t tag_) const + { return tag_ < tag ? -1 : tag_ > tag ? 1 : 0; } }; struct lookup_map_t { diff --git a/src/hb-set.hh b/src/hb-set.hh index cc061a7c..a464131d 100644 --- a/src/hb-set.hh +++ b/src/hb-set.hh @@ -45,7 +45,7 @@ struct hb_set_t struct page_map_t { - inline int cmp (const page_map_t *o) const { return (int) o->major - (int) major; } + inline int cmp (const page_map_t &o) const { return (int) o.major - (int) major; } uint32_t major; uint32_t index; diff --git a/src/hb-vector.hh b/src/hb-vector.hh index b30511d8..436b92bb 100644 --- a/src/hb-vector.hh +++ b/src/hb-vector.hh @@ -267,7 +267,7 @@ struct hb_vector_t while (min <= max) { int mid = ((unsigned int) min + (unsigned int) max) / 2; - int c = array[mid].cmp (&x); + int c = array[mid].cmp (x); if (c < 0) max = mid - 1; else if (c > 0) @@ -281,7 +281,7 @@ struct hb_vector_t } if (i) { - if (max < 0 || (max < (int) this->len && array[max].cmp (&x) > 0)) + if (max < 0 || (max < (int) this->len && array[max].cmp (x) > 0)) max++; *i = max; } commit 5fdf7b724eb3cb5ac60cd7f90d3250877ad7ca06 Author: Nathan Willis <nwil...@glyphography.com> Date: Thu Nov 15 17:40:21 2018 -0600 Usermanual: clusters chapter; add brief grapheme definition and clarify monotonous cluster handling. diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml index c59818fc..f48e89c2 100644 --- a/docs/usermanual-clusters.xml +++ b/docs/usermanual-clusters.xml @@ -14,15 +14,29 @@ unit. </para> <para> - During the shaping process, some shaping operations may - merge adjacent characters (for example, when two code points form - a ligature and are replaced by a single glyph) or split one - character into several (for example, when performing the Unicode - canonical decomposition of a code point). + A cluster is distinct from a <emphasis>grapheme</emphasis>, + which is the smallest unit of a writing system or script, + because clusters are only relevant for script shaping and the + layout of glyphs. + </para> + <para> + For example, a grapheme may be a letter, a number, a logogram, + or a symbol. When two letters form a ligature, however, they + combine into a single glyph. They are therefore part of the same + cluster and are treated as a unit — even though the two + original, underlying letters are separate graphemes. + </para> + <para> + During the shaping process, there are several shaping operations + that may merge adjacent characters (for example, when two code + points form a ligature or a conjunct form and are replaced by a + single glyph) or split one character into several (for example, + when decomposing a code point through the + <literal>ccmp</literal> feature). </para> <para> HarfBuzz tracks clusters independently from how these - shaping operations alter the individual glyphs that comprise the + shaping operations affect the individual glyphs that comprise the output HarfBuzz returns in a buffer. Consequently, a client program using HarfBuzz can utilize the cluster information to implement features such as: @@ -69,15 +83,15 @@ </listitem> </itemizedlist> <para> - When you add text to a HarfBuzz buffer, each code point is assigned - a <emphasis>cluster value</emphasis>. + When you add text to a HarfBuzz buffer, each code point must be + assigned a <emphasis>cluster value</emphasis>. </para> <para> This cluster value is an arbitrary number; HarfBuzz uses it only to distinguish between clusters. Many client programs will use the index of each code point in the input text stream as the - cluster value, for the sake of convenience; the actual value does - not matter. + cluster value. This is for the sake of convenience; the actual + value does not matter. </para> <para> Client programs can choose how HarfBuzz handles clusters during @@ -100,7 +114,7 @@ as well as the <emphasis>Zero Width Joiner</emphasis> and <emphasis>Zero Width Non-Joiner</emphasis> code points, are assigned the cluster value of the closest preceding code - point from <emphasis>diferent</emphasis> category. + point from <emphasis>different</emphasis> category. </para> <para> In essence, whenever a base character is followed by a mark @@ -161,22 +175,30 @@ </listitem> </itemizedlist> <para> + As mentioned earlier, client programs using HarfBuzz often + assign initial cluster values in a buffer by reusing the indices + of the code points in the input text. This gives a sequence of + cluster values that is monotonically increasing (for example, + 0,1,2,3,4,5). + </para> + <para> It is not <emphasis>required</emphasis> that the cluster values in a buffer be monotonically increasing. However, if the initial cluster values in a buffer are monotonic and the buffer is - configured to use clustering level 0 or 1, then HarfBuzz + configured to use cluster level 0 or 1, then HarfBuzz guarantees that the final cluster values in the shaped buffer will also be monotonic. No such guarantee is made for cluster level 2. </para> <para> - In levels 0 and 1, HarfBuzz implements the following conceptual model for - cluster values: + In levels 0 and 1, HarfBuzz implements the following conceptual + model for cluster values: </para> <itemizedlist spacing="compact"> <listitem> <para> - The sequence of cluster values will always remain monotonic. + If the sequence of input cluster values is monotonic, the + sequence of cluster values will remain monotonic. </para> </listitem> <listitem> @@ -231,7 +253,7 @@ </programlisting> <para> During shaping, HarfBuzz maps these characters to glyphs from - the font. For simplicity, let's assume that each character maps + the font. For simplicity, let us assume that each character maps to the corresponding, identical-looking glyph: </para> <programlisting> @@ -297,7 +319,7 @@ 0,1,2,3,4 </programlisting> <para> - If <literal>D</literal> is reordered before <literal>B</literal>, + If <literal>D</literal> is reordered to before <literal>B</literal>, then HarfBuzz merges the <literal>B</literal>, <literal>C</literal>, and <literal>D</literal> clusters, and we get: commit 939220e57da613e090d247aa1af2396c28370af4 Author: Nathan Willis <nwil...@glyphography.com> Date: Thu Nov 15 15:47:03 2018 -0600 Usermanual: clusters chapter, minor updates. diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml index f7db0f59..c59818fc 100644 --- a/docs/usermanual-clusters.xml +++ b/docs/usermanual-clusters.xml @@ -30,20 +30,21 @@ <itemizedlist> <listitem> <para> - Correctly positioning the cursor between two characters that - have combined into a single glyph by forming a ligature. + Correctly positioning the cursor within a shaped text run, + even when characters have formed ligatures, composed or + decomposed, reordered, or undergone other shaping operations. </para> </listitem> <listitem> <para> Correctly highlighting a text selection that includes some, - but not all, of the characters comprising a ligature. + but not all, of the characters in a word. </para> </listitem> <listitem> <para> Applying text attributes (such as color or underlining) to - part, but not all, of a composed base-and-mark combination. + part, but not all, of a word. </para> </listitem> <listitem> @@ -54,6 +55,12 @@ </listitem> <listitem> <para> + Determining the mapping between input characters and output + glyphs, such as which glyphs are ligatures. + </para> + </listitem> + <listitem> + <para> Performing line-breaking, justification, and other line-level or paragraph-level operations that must be done after shaping is complete, but which require character-level @@ -69,7 +76,7 @@ This cluster value is an arbitrary number; HarfBuzz uses it only to distinguish between clusters. Many client programs will use the index of each code point in the input text stream as the - cluster value, as a matter of convenience; the actual value does + cluster value, for the sake of convenience; the actual value does not matter. </para> <para> @@ -122,10 +129,10 @@ Level 1 differs from level 0 by not merging the clusters of marks and other modifier code points with the preceding "base" code point's cluster. By preserving the - cluster values of these marks and modifier code points, - script shaping can perform additional operations that might - lead to improved results (for example, reordering a sequence - of marks). + separate cluster values of these marks and modifier code + points, script shapers can perform additional operations + that might lead to improved results (for example, reordering + a sequence of marks). </para> <para> Client programs can specify level 1 behavior for a buffer by commit 53ac46e974cf0ee8720b40ef394714eb97ff53b9 Author: Nathan Willis <nwil...@glyphography.com> Date: Mon Nov 12 12:17:06 2018 -0600 Usermanual: expand clusters chapter. diff --git a/docs/usermanual-clusters.xml b/docs/usermanual-clusters.xml index 7b2c7adc..f7db0f59 100644 --- a/docs/usermanual-clusters.xml +++ b/docs/usermanual-clusters.xml @@ -5,306 +5,509 @@ <!ENTITY version SYSTEM "version.xml"> ]> <chapter id="clusters"> -<sect1 id="clusters"> <title>Clusters</title> - <para> - In shaping text, a <emphasis>cluster</emphasis> is a sequence of - code points that needs to be treated as a single, indivisible unit. - </para> - <para> - When you add text to a HB buffer, each character is associated with - a <emphasis>cluster value</emphasis>. This is an arbitrary number as - far as HB is concerned. - </para> - <para> - Most clients will use UTF-8, UTF-16, or UTF-32 indices, but the - actual number does not matter. Moreover, it is not required for the - cluster values to be monotonically increasing, but pretty much all - of HB's tests are performed on monotonically increasing cluster - numbers. Nevertheless, there is no such assumption in the code - itself. With that in mind, let's examine what happens with cluster - values during shaping under each cluster-level. - </para> - <para> - HarfBuzz provides three <emphasis>levels</emphasis> of clustering - support. Level 0 is the default behavior and reproduces the behavior - of the old HarfBuzz library. Level 1 tweaks this behavior slightly - to produce better results, so level 1 clustering is recommended for - code that is not required to implement backward compatibility with - the old HarfBuzz. - </para> - <para> - Level 2 differs significantly in how it treats cluster values. - Levels 0 and 1 both process ligatures and glyph decomposition by - merging clusters; level 2 does not. - </para> - <para> - The conceptual model for what the cluster values mean, in levels 0 - and 1, is this: - </para> - <itemizedlist spacing="compact"> - <listitem> - <para> - the sequence of cluster values will always remain monotone - </para> - </listitem> - <listitem> - <para> - each value represents a single cluster - </para> - </listitem> - <listitem> - <para> - each cluster contains one or more glyphs and one or more - characters - </para> - </listitem> - </itemizedlist> - <para> - Assuming that initial cluster numbers were monotonically increasing - and distinct, then all adjacent glyphs having the same cluster - number belong to the same cluster, and all characters belong to the - cluster that has the highest number not larger than their initial - cluster number. This will become clearer with an example. - </para> -</sect1> -<sect1 id="a-clustering-example-for-levels-0-and-1"> - <title>A clustering example for levels 0 and 1</title> - <para> - Let's say we start with the following character sequence and cluster - values: - </para> - <programlisting> - A,B,C,D,E - 0,1,2,3,4 -</programlisting> - <para> - We then map the characters to glyphs. For simplicity, let's assume - that each character maps to the corresponding, identical-looking - glyph: - </para> - <programlisting> - A,B,C,D,E - 0,1,2,3,4 -</programlisting> - <para> - Now if, for example, <literal>B</literal> and <literal>C</literal> - ligate, then the clusters to which they belong "merge". - This merged cluster takes for its cluster number the minimum of all - the cluster numbers of the clusters that went in. In this case, we - get: - </para> - <programlisting> - A,BC,D,E - 0,1 ,3,4 -</programlisting> - <para> - Now let's assume that the <literal>BC</literal> glyph decomposes - into three components, and <literal>D</literal> also decomposes into - two. The components each inherit the cluster value of their parent: - </para> - <programlisting> - A,BC0,BC1,BC2,D0,D1,E - 0,1 ,1 ,1 ,3 ,3 ,4 -</programlisting> - <para> - Now if <literal>BC2</literal> and <literal>D0</literal> ligate, then - their clusters (numbers 1 and 3) merge into - <literal>min(1,3) = 1</literal>: - </para> - <programlisting> - A,BC0,BC1,BC2D0,D1,E - 0,1 ,1 ,1 ,1 ,4 -</programlisting> - <para> - At this point, cluster 1 means: the character sequence - <literal>BCD</literal> is represented by glyphs - <literal>BC0,BC1,BC2D0,D1</literal> and cannot be broken down any - further. - </para> -</sect1> -<sect1 id="reordering-in-levels-0-and-1"> - <title>Reordering in levels 0 and 1</title> - <para> - Another common operation in the more complex shapers is when things - reorder. In those cases, to maintain monotone clusters, HB merges - the clusters of everything in the reordering sequence. For example, - let's again start with the character sequence: - </para> - <programlisting> - A,B,C,D,E - 0,1,2,3,4 -</programlisting> - <para> - If <literal>D</literal> is reordered before <literal>B</literal>, - then the <literal>B</literal>, <literal>C</literal>, and - <literal>D</literal> clusters merge, and we get: - </para> - <programlisting> - A,D,B,C,E - 0,1,1,1,4 -</programlisting> - <para> - This is clearly not ideal, but it is the only sensible way to - maintain monotone indices and retain the true relationship between - glyphs and characters. - </para> -</sect1> -<sect1 id="the-distinction-between-levels-0-and-1"> - <title>The distinction between levels 0 and 1</title> - <para> - So, the above is pretty much what cluster levels 0 and 1 do. The - only difference between the two is this: in level 0, at the very - beginning of the shaping process, we also merge clusters between - base characters and all Unicode marks (combining or not) following - them. E.g.: - </para> - <programlisting> - A,acute,B - 0,1 ,2 -</programlisting> - <para> - will become: - </para> - <programlisting> - A,acute,B - 0,0 ,2 -</programlisting> - <para> - This is the default behavior. We do it because Windows did it and - old HarfBuzz did it, so this remained the default. But this behavior - makes it impossible to color diacritic marks differently from their - base characters. That's why in level 1 we do not perform this - initial merging step. - </para> - <para> - For clients, level 0 is more convenient if they rely on HarfBuzz - clusters for cursor positioning. But that's wrong anyway: cursor - positions should be determined based on Unicode grapheme boundaries, - NOT shaping clusters. As such, level 1 clusters are preferred. - </para> - <para> - One last note about levels 0 and 1. We currently don't allow a - <literal>MultipleSubst</literal> lookup to replace a glyph with zero - glyphs (i.e., to delete a glyph). But in some other situations, - glyphs can be deleted. In those cases, if the glyph being deleted is - the last glyph of its cluster, we make sure to merge the cluster - with a neighboring cluster. - </para> - <para> - This is, primarily, to make sure that the starting cluster of the - text always has the cluster index pointing to the start of the text - for the run; more than one client currently relies on this - guarantee. - </para> - <para> - Incidentally, Apple's CoreText does something else to maintain the - same promise: it inserts a glyph with id 65535 at the beginning of - the glyph string if the glyph corresponding to the first character - in the run was deleted. HarfBuzz might do something similar in the - future. - </para> -</sect1> -<sect1 id="level-2"> - <title>Level 2</title> - <para> - Level 2 is a different beast from levels 0 and 1. It is simple to - describe, but hard to make sense of. It simply doesn't do any - cluster merging whatsoever. When things ligate or otherwise multiple - glyphs turn into one, the cluster value of the first glyph is - retained. - </para> - <para> - Here are a few examples of why processing cluster values produced at - this level might be tricky: - </para> - <sect2 id="ligatures-with-combining-marks"> - <title>Ligatures with combining marks</title> - <para> - Imagine capital letters are bases and lower case letters are - combining marks. With an input sequence like this: + <section id="clusters"> + <title>Clusters</title> + <para> + In text shaping, a <emphasis>cluster</emphasis> is a sequence of + characters that needs to be treated as a single, indivisible + unit. </para> - <programlisting> - A,a,B,b,C,c - 0,1,2,3,4,5 -</programlisting> <para> - if <literal>A,B,C</literal> ligate, then here are the cluster - values one would get under the various levels: + During the shaping process, some shaping operations may + merge adjacent characters (for example, when two code points form + a ligature and are replaced by a single glyph) or split one + character into several (for example, when performing the Unicode + canonical decomposition of a code point). </para> <para> - level 0: + HarfBuzz tracks clusters independently from how these + shaping operations alter the individual glyphs that comprise the + output HarfBuzz returns in a buffer. Consequently, + a client program using HarfBuzz can utilize the cluster + information to implement features such as: + </para> + <itemizedlist> + <listitem> + <para> + Correctly positioning the cursor between two characters that + have combined into a single glyph by forming a ligature. + </para> + </listitem> + <listitem> + <para> + Correctly highlighting a text selection that includes some, + but not all, of the characters comprising a ligature. + </para> + </listitem> + <listitem> + <para> + Applying text attributes (such as color or underlining) to + part, but not all, of a composed base-and-mark combination. + </para> + </listitem> + <listitem> + <para> + Generating output document formats (such as PDF) with + embedded text that can be fully extracted. + </para> + </listitem> + <listitem> + <para> + Performing line-breaking, justification, and other + line-level or paragraph-level operations that must be done + after shaping is complete, but which require character-level + properties. + </para> + </listitem> + </itemizedlist> + <para> + When you add text to a HarfBuzz buffer, each code point is assigned + a <emphasis>cluster value</emphasis>. + </para> + <para> + This cluster value is an arbitrary number; HarfBuzz uses it only + to distinguish between clusters. Many client programs will use + the index of each code point in the input text stream as the + cluster value, as a matter of convenience; the actual value does + not matter. + </para> + <para> + Client programs can choose how HarfBuzz handles clusters during + shaping by setting the + <literal>cluster_level</literal> of the + buffer. HarfBuzz offers three <emphasis>levels</emphasis> of + clustering support for this property: + </para> + <itemizedlist> + <listitem> + <para><emphasis>Level 0</emphasis> is the default and + reproduces the behavior of the old HarfBuzz library. + </para> + <para> + The distinguishing feature of level 0 behavior is that, at + the beginning of processing the buffer, all code points that + are categorized as <emphasis>marks</emphasis>, + <emphasis>modifier symbols</emphasis>, or + <emphasis>Emoji extended pictographic</emphasis> modifiers, + as well as the <emphasis>Zero Width Joiner</emphasis> and + <emphasis>Zero Width Non-Joiner</emphasis> code points, are + assigned the cluster value of the closest preceding code + point from <emphasis>diferent</emphasis> category. + </para> + <para> + In essence, whenever a base character is followed by a mark + character or a sequence of mark characters, those marks are + reassigned to the same initial cluster value as the base + character. This reassignment is referred to as + "merging" the affected clusters. This behavior is based on + the Grapheme Cluster Boundary specification in <ulink + url="https://www.unicode.org/reports/tr29/#Regex_Definitions">Unicode + Technical Report 29</ulink>. + </para> + <para> + Client programs can specify level 0 behavior for a buffer by + setting its <literal>cluster_level</literal> to + <literal>HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES</literal>. + </para> + </listitem> + <listitem> + <para> + <emphasis>Level 1</emphasis> tweaks the old behavior + slightly to produce better results. Therefore, level 1 + clustering is recommended for code that is not required to + implement backward compatibility with the old HarfBuzz. + </para> + <para> + Level 1 differs from level 0 by not merging the + clusters of marks and other modifier code points with the + preceding "base" code point's cluster. By preserving the + cluster values of these marks and modifier code points, + script shaping can perform additional operations that might + lead to improved results (for example, reordering a sequence + of marks). + </para> + <para> + Client programs can specify level 1 behavior for a buffer by + setting its <literal>cluster_level</literal> to + <literal>HB_BUFFER_CLUSTER_LEVEL_MONOTONE_CHARACTERS</literal>. + </para> + </listitem> + <listitem> + <para> + <emphasis>Level 2</emphasis> differs significantly in how it + treats cluster values. In level 2, HarfBuzz never merges + clusters. + </para> + <para> + This difference can be seen most clearly when HarfBuzz processes + ligature substitutions and glyph decompositions. In level 0 + and level 1, ligatures and glyph decomposition both involve + merging clusters; in level 2, neither of these operations + triggers a merge. + </para> + <para> + Client programs can specify level 2 behavior for a buffer by + setting its <literal>cluster_level</literal> to + <literal>HB_BUFFER_CLUSTER_LEVEL_CHARACTERS</literal>. + </para> + </listitem> + </itemizedlist> + <para> + It is not <emphasis>required</emphasis> that the cluster values + in a buffer be monotonically increasing. However, if the initial + cluster values in a buffer are monotonic and the buffer is + configured to use clustering level 0 or 1, then HarfBuzz + guarantees that the final cluster values in the shaped buffer + will also be monotonic. No such guarantee is made for cluster + level 2. + </para> + <para> + In levels 0 and 1, HarfBuzz implements the following conceptual model for + cluster values: + </para> + <itemizedlist spacing="compact"> + <listitem> + <para> + The sequence of cluster values will always remain monotonic. + </para> + </listitem> + <listitem> + <para> + Each cluster value represents a single cluster. + </para> + </listitem> + <listitem> + <para> + Each cluster contains one or more glyphs and one or more + characters. + </para> + </listitem> + </itemizedlist> + <para> + In practice, this model offers several benefits. Assuming that + the initial cluster values were monotonically increasing + and distinct before shaping began, then, in the final output: + </para> + <itemizedlist spacing="compact"> + <listitem> + <para> + All adjacent glyphs having the same final cluster + value belong to the same cluster. + </para> + </listitem> + <listitem> + <para> + Each character belongs to the cluster that has the highest + cluster value <emphasis>not larger than</emphasis> its + initial cluster value. + </para> + </listitem> + </itemizedlist> + + </section> + <section id="a-clustering-example-for-levels-0-and-1"> + <title>A clustering example for levels 0 and 1</title> + <para> + The guarantees and benefits of level 0 and level 1 can be seen + with some examples. First, let us examine what happens with cluster + values when shaping involves cluster merging with ligatures and + decomposition. + </para> + <para> + Let's say we start with the following character sequence (top row) and + initial cluster values (bottom row): </para> <programlisting> - ABC,a,b,c - 0 ,0,0,0 -</programlisting> + A,B,C,D,E + 0,1,2,3,4 + </programlisting> <para> - level 1: + During shaping, HarfBuzz maps these characters to glyphs from + the font. For simplicity, let's assume that each character maps + to the corresponding, identical-looking glyph: </para> <programlisting> - ABC,a,b,c - 0 ,0,0,5 -</programlisting> + A,B,C,D,E + 0,1,2,3,4 + </programlisting> <para> - level 2: + Now if, for example, <literal>B</literal> and <literal>C</literal> + form a ligature, then the clusters to which they belong + "merge". This merged cluster takes for its cluster + value the minimum of all the cluster values of the clusters that + went in to the ligature. In this case, we get: </para> <programlisting> - ABC,a,b,c - 0 ,1,3,5 -</programlisting> + A,BC,D,E + 0,1 ,3,4 + </programlisting> + <para> + because 1 is the minimum of the set {1,2}, which were the + cluster values of <literal>B</literal> and + <literal>C</literal>. + </para> <para> - Making sense of the last example is the hardest for a client, - because there is nothing in the cluster values to suggest that - <literal>B</literal> and <literal>C</literal> ligated with - <literal>A</literal>. + Next, let us say that the <literal>BC</literal> ligature glyph + decomposes into three components, and <literal>D</literal> also + decomposes into two components. These components each inherit the + cluster value of their parent: </para> - </sect2> - <sect2 id="reordering"> - <title>Reordering</title> + <programlisting> + A,BC0,BC1,BC2,D0,D1,E + 0,1 ,1 ,1 ,3 ,3 ,4 + </programlisting> <para> - Another tricky case is when things reorder. Under level 2: + Next, if <literal>BC2</literal> and <literal>D0</literal> form a + ligature, then their clusters (cluster values 1 and 3) merge into + <literal>min(1,3) = 1</literal>: </para> <programlisting> - A,B,C,D,E - 0,1,2,3,4 -</programlisting> + A,BC0,BC1,BC2D0,D1,E + 0,1 ,1 ,1 ,1 ,4 + </programlisting> + <para> + At this point, cluster 1 means: the character sequence + <literal>BCD</literal> is represented by glyphs + <literal>BC0,BC1,BC2D0,D1</literal> and cannot be broken down any + further. + </para> + </section> + <section id="reordering-in-levels-0-and-1"> + <title>Reordering in levels 0 and 1</title> + <para> + Another common operation in the more complex shapers is glyph + reordering. In order to maintain a monotonic cluster sequence + when glyph reordering takes place, HarfBuzz merges the clusters + of everything in the reordering sequence. + </para> <para> - Now imagine <literal>D</literal> moves before - <literal>B</literal>: + For example, let us again start with the character sequence (top + row) and initial cluster values (bottom row): </para> <programlisting> - A,D,B,C,E - 0,3,1,2,4 -</programlisting> + A,B,C,D,E + 0,1,2,3,4 + </programlisting> <para> - Now, if <literal>D</literal> ligates with <literal>B</literal>, we + If <literal>D</literal> is reordered before <literal>B</literal>, + then HarfBuzz merges the <literal>B</literal>, + <literal>C</literal>, and <literal>D</literal> clusters, and we get: </para> <programlisting> - A,DB,C,E - 0,3 ,2,4 -</programlisting> + A,D,B,C,E + 0,1,1,1,4 + </programlisting> <para> - In a different scenario, <literal>A</literal> and - <literal>B</literal> could have ligated - <emphasis>before</emphasis> <literal>D</literal> reordered; that - would have resulted in: + This is clearly not ideal, but it is the only sensible way to + maintain a monotonic sequence of cluster values and retain the + true relationship between glyphs and characters. + </para> + </section> + <section id="the-distinction-between-levels-0-and-1"> + <title>The distinction between levels 0 and 1</title> + <para> + The preceding examples demonstrate the main effects of using + cluster levels 0 and 1. The only difference between the two + levels is this: in level 0, at the very beginning of the shaping + process, HarfBuzz also merges clusters between any base character + and all Unicode marks (combining or not) that follow it. + </para> + <para> + For example, let us start with the following character sequence + (top row) and accompanying initial cluster values (bottom row): + </para> + <programlisting> + A,acute,B + 0,1 ,2 + </programlisting> + <para> + The <literal>acute</literal> is a Unicode mark. If HarfBuzz is + using cluster level 0 on this sequence, then the + <literal>A</literal> and <literal>acute</literal> clusters will + merge, and the result will become: </para> <programlisting> - AB,D,C,E - 0 ,3,2,4 -</programlisting> + A,acute,B + 0,0 ,2 + </programlisting> + <para> + This initial cluster merging is the default behavior of the + Windows shaping engine, and the old HarfBuzz codebase copied + that behavior to maintain compatibility. Consequently, it has + remained the default behavior in the new HarfBuzz codebase. + </para> + <para> + But this initial cluster-merging behavior makes it impossible to + color diacritic marks differently from their base + characters. That is why, in level 1, HarfBuzz does not perform + the initial merging step. + </para> + <para> + For client programs that rely on HarfBuzz cluster values to + perform cursor positioning, level 0 is more convenient. But + relying on cluster boundaries for cursor positioning is wrong: cursor + positions should be determined based on Unicode grapheme + boundaries, not on shaping-cluster boundaries. As such, level 1 + clusters are preferred. + </para> + <para> + One last note about levels 0 and 1. HarfBuzz currently does not allow a + <literal>MultipleSubst</literal> lookup to replace a glyph with zero + glyphs (in other words, to delete a glyph). But, in some other situations, + glyphs can be deleted. In those cases, if the glyph being deleted is + the last glyph of its cluster, HarfBuzz makes sure to merge the cluster + with a neighboring cluster. + </para> + <para> + This is done primarily to make sure that the starting cluster of the + text always has the cluster index pointing to the start of the text + for the run; more than one client currently relies on this + guarantee. + </para> + <para> + Incidentally, Apple's CoreText does something else to maintain the + same promise: it inserts a glyph with id 65535 at the beginning of + the glyph string if the glyph corresponding to the first character + in the run was deleted. HarfBuzz might do something similar in the + future. + </para> + </section> + <section id="level-2"> + <title>Level 2</title> + <para> + HarfBuzz's level 2 cluster behavior uses a significantly + different model than that of level 0 and level 1. + </para> <para> - There's no way to differentiate between these two scenarios based - on the cluster numbers alone. + The level 2 behavior is easy to describe, but it may be + difficult to understand in practical terms. In brief, level 2 + performs no merging of clusters whatsoever. </para> <para> - Another problem happens with ligatures under level 2 if the - direction of the text is forced to opposite of its natural - direction (e.g. left-to-right Arabic). But that's too much of a - corner case to worry about. + When glyphs form a ligature (or when some other feature + substitutes multiple glyphs with one glyph), the cluster value + of the first glyph is retained as the cluster value for the + ligature. However, no subsequent clusters — including + marks and modifiers — are affected. </para> - </sect2> -</sect1> + <para> + Level 2 cluster behavior is less complex than level 0 or level + 1, but there are a few cases in which processing cluster values + produced at level 2 may be tricky. + </para> + <section id="ligatures-with-combining-marks-in-level-2"> + <title>Ligatures with combining marks in level 2</title> + <para> + The first example of how HarfBuzz's level 2 cluster behavior + can be tricky is when the text to be shaped includes combining + marks attached to ligatures. + </para> + <para> + Let us start with an input sequence with the following + characters (top row) and initial cluster values (bottom row): + </para> + <programlisting> + A,acute,B,breve,C,circumflex + 0,1 ,2,3 ,4,5 + </programlisting> + <para> + If the sequence <literal>A,B,C</literal> forms a ligature, + then these are the cluster values HarfBuzz will return under + the various cluster levels: + </para> + <para> + Level 0: + </para> + <programlisting> + ABC,acute,breve,circumflex + 0 ,0 ,0 ,0 + </programlisting> + <para> + Level 1: + </para> + <programlisting> + ABC,acute,breve,circumflex + 0 ,0 ,0 ,5 + </programlisting> + <para> + Level 2: + </para> + <programlisting> + ABC,acute,breve,circumflex + 0 ,1 ,3 ,5 + </programlisting> + <para> + Making sense of the level 2 result is the hardest for a client + program, because there is nothing in the cluster values that + indicates that <literal>B</literal> and <literal>C</literal> + formed a ligature with <literal>A</literal>. + </para> + <para> + In contrast, the "merged" cluster values of the mark glyphs + that are seen in the level 0 and level 1 output are evidence + that a ligature substitution took place. + </para> + </section> + <section id="reordering-in-level-2"> + <title>Reordering in level 2</title> + <para> + Another example of how HarfBuzz's level 2 cluster behavior + can be tricky is when glyphs reorder. Consider an input sequence + with the following characters (top row) and initial cluster + values (bottom row): + </para> + <programlisting> + A,B,C,D,E + 0,1,2,3,4 + </programlisting> + <para> + Now imagine <literal>D</literal> moves before + <literal>B</literal> in a reordering operation. The cluster + values will then be: + </para> + <programlisting> + A,D,B,C,E + 0,3,1,2,4 + </programlisting> + <para> + Next, if <literal>D</literal> forms a ligature with + <literal>B</literal>, the output is: + </para> + <programlisting> + A,DB,C,E + 0,3 ,2,4 + </programlisting> + <para> + However, in a different scenario, in which the shaping rules + of the script instead caused <literal>A</literal> and + <literal>B</literal> to form a ligature + <emphasis>before</emphasis> the <literal>D</literal> reordered, the + result would be: + </para> + <programlisting> + AB,D,C,E + 0 ,3,2,4 + </programlisting> + <para> + There is no way for a client program to differentiate between + these two scenarios based on the cluster values + alone. Consequently, client programs that use level 2 might + need to undertake additional work in order to manage cursor + positioning, text attributes, or other desired features. + </para> + </section> + <section id="other-considerations-in-level-2"> + <title>Other considerations in level 2</title> + <para> + There may be other problems encountered with ligatures under + level 2, such as if the direction of the text is forced to + opposite of its natural direction (for example, left-to-right + Arabic). But, generally speaking, these other scenarios are + minor corner cases that are too obscure for most client + programs to need to worry about. + </para> + </section> + </section> </chapter> _______________________________________________ HarfBuzz mailing list HarfBuzz@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/harfbuzz