[HarfBuzz] harfbuzz-ng: Branch 'master' - 2 commits

2012-07-24 Thread Behdad Esfahbod
 src/hb-unicode-private.hh |   28 
 1 file changed, 16 insertions(+), 12 deletions(-)

New commits:
commit 478fd0529b868b22905a9dedf331ac7cc9721723
Author: Behdad Esfahbod 
Date:   Tue Jul 24 17:09:01 2012 -0400

Minor

diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh
index fd33387..0ba2fcc 100644
--- a/src/hb-unicode-private.hh
+++ b/src/hb-unicode-private.hh
@@ -111,9 +111,10 @@ _hb_unicode_modified_combining_class (hb_unicode_funcs_t 
*ufuncs,
 static inline hb_bool_t
 _hb_unicode_is_variation_selector (hb_codepoint_t unicode)
 {
-  return unlikely ((unicode >=  0x180B && unicode <=  0x180D) || /* MONGOLIAN 
FREE VARIATION SELECTOR ONE..THREE */
-  (unicode >=  0xFE00 && unicode <=  0xFE0F) || /* VARIATION 
SELECTOR-1..16 */
-  (unicode >= 0xE0100 && unicode <= 0xE01EF));  /* VARIATION 
SELECTOR-17..256 */
+  return unlikely (hb_in_ranges (unicode,
+0x180B, 0x180D, /* MONGOLIAN 
FREE VARIATION SELECTOR ONE..THREE */
+0xFE00, 0xFE0F, /* VARIATION 
SELECTOR-1..16 */
+0xE0100, 0xE01EF));  /* 
VARIATION SELECTOR-17..256 */
 }
 
 /* Zero-Width invisible characters:
@@ -147,16 +148,16 @@ _hb_unicode_is_variation_selector (hb_codepoint_t unicode)
 static inline hb_bool_t
 _hb_unicode_is_zero_width (hb_codepoint_t ch)
 {
-  return ((ch & ~0x007F) == 0x2000 && (
- (ch >= 0x200B && ch <= 0x200F) ||
- (ch >= 0x202A && ch <= 0x202E) ||
- (ch >= 0x2060 && ch <= 0x2063) ||
- (ch == 0x2028)
-)) || unlikely (ch == 0x0009
- || ch == 0x00AD
- || ch == 0x034F
- || ch == 0x180E
- || ch == 0xFEFF);
+  return ((ch & ~0x007F) == 0x2000 && (hb_in_ranges (ch,
+0x200B, 
0x200F,
+0x202A, 
0x202E,
+0x2060, 
0x2063) ||
+  (ch == 0x2028))) ||
+ unlikely (ch == 0x0009 ||
+   ch == 0x00AD ||
+   ch == 0x034F ||
+   ch == 0x180E ||
+   ch == 0xFEFF);
 }
 
 #endif /* HB_UNICODE_PRIVATE_HH */
commit 8979a7f6f2b44ade4c0198a31ae08561b35ce009
Author: Behdad Esfahbod 
Date:   Tue Jul 24 17:03:55 2012 -0400

[Mongolian] Remove Mongolian Vowel Separator at the end of shaping

Results match Uniscribe now.

diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh
index c781035..fd33387 100644
--- a/src/hb-unicode-private.hh
+++ b/src/hb-unicode-private.hh
@@ -121,6 +121,8 @@ _hb_unicode_is_variation_selector (hb_codepoint_t unicode)
  *  00AD  SOFT HYPHEN
  *  034F  COMBINING GRAPHEME JOINER
  *
+ *  180E  MONGOLIAN VOWEL SEPARATOR
+ *
  *  200B  ZERO WIDTH SPACE
  *  200C  ZERO WIDTH NON-JOINER
  *  200D  ZERO WIDTH JOINER
@@ -153,6 +155,7 @@ _hb_unicode_is_zero_width (hb_codepoint_t ch)
 )) || unlikely (ch == 0x0009
  || ch == 0x00AD
  || ch == 0x034F
+ || ch == 0x180E
  || ch == 0xFEFF);
 }
 
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


[HarfBuzz] harfbuzz-ng: Branch 'master' - 2 commits

2012-07-24 Thread Behdad Esfahbod
 configure.ac |   12 +
 src/Makefile.am  |7 
 src/hb-coretext-private.hh   |   42 
 src/hb-coretext.cc   |  323 +++
 src/hb-coretext.h|   43 
 src/hb-open-file-private.hh  |8 
 src/hb-ot-head-table.hh  |2 
 src/hb-ot-hhea-table.hh  |2 
 src/hb-ot-hmtx-table.hh  |2 
 src/hb-ot-layout-common-private.hh   |   14 -
 src/hb-ot-layout-gdef-table.hh   |   20 +-
 src/hb-ot-layout-gpos-table.hh   |   48 ++---
 src/hb-ot-layout-gsub-table.hh   |   30 +--
 src/hb-ot-layout-gsubgpos-private.hh |   32 +--
 src/hb-ot-maxp-table.hh  |2 
 src/hb-ot-name-table.hh  |2 
 src/hb-shape.cc  |6 
 17 files changed, 515 insertions(+), 80 deletions(-)

New commits:
commit aa6d849838d5231465ae1a25a4dd5ea1e9380ff9
Author: Jonathan Kew 
Date:   Tue Jul 24 15:52:32 2012 -0400

[CoreText] Add basic Core Text backend for comparison with our native 
shaping

Does not attempt to handle clusters in a Uniscribe- or HarfBuzz-compatible 
way;
just returns the original string indexes that CT maintains. These may even 
be
out-of-order in the case of reordrant glyphs.

diff --git a/configure.ac b/configure.ac
index cbabf83..030b04a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -186,6 +186,18 @@ AM_CONDITIONAL(HAVE_UNISCRIBE, $have_uniscribe)
 
 dnl ===
 
+AC_CHECK_HEADERS(ApplicationServices/ApplicationServices.h, 
have_coretext=true, have_coretext=false)
+if $have_coretext; then
+   CORETEXT_CFLAGS=
+   CORETEXT_LIBS=
+   AC_SUBST(CORETEXT_CFLAGS)
+   AC_SUBST(CORETEXT_LIBS)
+   AC_DEFINE(HAVE_CORETEXT, 1, [Have Core Text backend])
+fi
+AM_CONDITIONAL(HAVE_CORETEXT, $have_coretext)
+
+dnl ===
+
 AC_CACHE_CHECK([for Intel atomic primitives], 
hb_cv_have_intel_atomic_primitives, [
hb_cv_have_intel_atomic_primitives=false
AC_TRY_LINK([
diff --git a/src/Makefile.am b/src/Makefile.am
index 9fd135a..f2fce6e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -138,6 +138,13 @@ HBSOURCES += hb-uniscribe.cc hb-uniscribe-private.hh
 HBHEADERS += hb-uniscribe.h
 endif
 
+if HAVE_CORETEXT
+HBCFLAGS += $(CORETEXT_CFLAGS)
+HBLIBS   += $(CORETEXT_LIBS)
+HBSOURCES += hb-coretext.cc hb-coretext-private.hh
+HBHEADERS += hb-coretext.h
+endif
+
 # Use a C linker, not C++; Don't link to libstdc++
 libharfbuzz_la_LINK = $(LINK) $(libharfbuzz_la_LDFLAGS)
 libharfbuzz_la_SOURCES = $(HBSOURCES) $(HBHEADERS)
diff --git a/src/hb-coretext-private.hh b/src/hb-coretext-private.hh
new file mode 100644
index 000..153106c
--- /dev/null
+++ b/src/hb-coretext-private.hh
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2012  Mozilla Foundation.
+ *
+ *  This is part of HarfBuzz, a text shaping library.
+ *
+ * Permission is hereby granted, without written agreement and without
+ * license or royalty fees, to use, copy, modify, and distribute this
+ * software and its documentation for any purpose, provided that the
+ * above copyright notice and the following two paragraphs appear in
+ * all copies of this software.
+ *
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
+ * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
+ * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
+ * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
+ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+ *
+ * Mozilla Author(s): Jonathan Kew
+ */
+
+#ifndef HB_CORETEXT_PRIVATE_HH
+#define HB_CORETEXT_PRIVATE_HH
+
+#include "hb-private.hh"
+
+#include "hb-coretext.h"
+
+
+HB_INTERNAL hb_bool_t
+_hb_coretext_shape (hb_font_t  *font,
+hb_buffer_t*buffer,
+const hb_feature_t *features,
+unsigned intnum_features);
+
+
+#endif /* HB_CORETEXT_PRIVATE_HH */
diff --git a/src/hb-coretext.cc b/src/hb-coretext.cc
new file mode 100644
index 000..f49e76e
--- /dev/null
+++ b/src/hb-coretext.cc
@@ -0,0 +1,323 @@
+/*
+ * Copyright © 2012  Mozilla Foundation.
+ *
+ *  This is part of HarfBuzz, a text shaping library.
+ *
+ * Permission is hereby granted, without written agreement and without
+ * license or royalty fees, to use, copy, modify, and distribute this
+ * software and its documentation for any purpose, provided that the
+ * above copyright notice and the following two paragraphs appear

[HarfBuzz] harfbuzz-ng: Branch 'master'

2012-07-24 Thread Behdad Esfahbod
 src/hb-shape.cc |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

New commits:
commit 97aa0b738a33b73a3f9763dd2950f2dd39f596ed
Author: Behdad Esfahbod 
Date:   Tue Jul 24 15:02:34 2012 -0400

Minor const correctness shuffling

diff --git a/src/hb-shape.cc b/src/hb-shape.cc
index 56c9046..60a2dce 100644
--- a/src/hb-shape.cc
+++ b/src/hb-shape.cc
@@ -68,13 +68,13 @@ static const struct hb_shaper_pair_t {
 
 /* Thread-safe, lock-free, shapers */
 
-static hb_shaper_pair_t *static_shapers;
+static const hb_shaper_pair_t *static_shapers;
 
 static
 void free_static_shapers (void)
 {
   if (unlikely (static_shapers != all_shapers))
-free (static_shapers);
+free ((void *) static_shapers);
 }
 
 static const hb_shaper_pair_t *
@@ -87,7 +87,7 @@ retry:
   {
 char *env = getenv ("HB_SHAPER_LIST");
 if (!env || !*env) {
-  (void) hb_atomic_ptr_cmpexch (&static_shapers, NULL, (const 
hb_shaper_pair_t *) all_shapers);
+  (void) hb_atomic_ptr_cmpexch (&static_shapers, NULL, &all_shapers[0]);
   return (const hb_shaper_pair_t *) all_shapers;
 }
 
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


[HarfBuzz] harfbuzz-ng: Branch 'master'

2012-07-24 Thread Behdad Esfahbod
 src/hb-ot-shape-complex-indic.cc|  
  2 +-
 test/shaping/texts/in-tree/shaper-indic/indic/script-gurmukhi/misc/misc.txt |  
  1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

New commits:
commit 6411e74caf23af7b0545f1fe54d19a1c8da895e8
Author: Behdad Esfahbod 
Date:   Tue Jul 24 13:48:49 2012 -0400

[Indic] Reposition Gurmukhi top matras to after post

The font is forming a post-base consonant in some samples, and Uniscribe
positions top matra on the post-base.  Do the same.

Gurmukhi failures down from 59 to 41 (0.0674242%).

diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc
index d0c3c09..ea5648a 100644
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc
@@ -151,7 +151,7 @@ consonant_position (hb_codepoint_t u, hb_ot_map_t *map, 
hb_font_t *font)
)
 #define MATRA_POS_TOP(u)   ( /* BENG and MLYM don't have top matras. */ \
  IS_DEVA(u) ? POS_AFTER_SUB  : \
- IS_GURM(u) ? POS_AFTER_SUB  : \
+ IS_GURM(u) ? POS_AFTER_POST  : /* Deviate 
from spec */ \
  IS_GUJA(u) ? POS_AFTER_SUB  : \
  IS_ORYA(u) ? POS_AFTER_MAIN : \
  IS_TAML(u) ? POS_AFTER_SUB  : \
diff --git 
a/test/shaping/texts/in-tree/shaper-indic/indic/script-gurmukhi/misc/misc.txt 
b/test/shaping/texts/in-tree/shaper-indic/indic/script-gurmukhi/misc/misc.txt
index bbc6646..27a39f6 100644
--- 
a/test/shaping/texts/in-tree/shaper-indic/indic/script-gurmukhi/misc/misc.txt
+++ 
b/test/shaping/texts/in-tree/shaper-indic/indic/script-gurmukhi/misc/misc.txt
@@ -1 +1,2 @@
 ਕ੍ਹ
+ਤ੍ਯੋ
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


[HarfBuzz] harfbuzz-ng: Branch 'master'

2012-07-24 Thread Behdad Esfahbod
 test/shaping/hb_test_tools.py| 
   1 +
 test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt | 
   2 ++
 2 files changed, 3 insertions(+)

New commits:
commit c3f769ba09df319fa69d04f68c57444f95eceee6
Author: Behdad Esfahbod 
Date:   Tue Jul 24 13:26:32 2012 -0400

[Indic] Ignore Uniscribe output containing two zero-width space glyphs

Uniscribe is buggy and sometimes /eats/ a mark next to a non-joiner.
Most of Malayalam failures were actually hitting this bug.

Ignore test output with two zero-width space glyphs.  This is a hack
until we build up the test suite infrastructure better.

Bengali went down by 9, Devanagari by 2, Kannada by 130, Malayalam down
from 1197 to 307, Sinhala down by 16, Telugu down by 26.  New stats:

BENGALI: 353996 out of 354285 tests passed. 289 failed (0.0815727%)
DEVANAGARI: 693573 out of 693628 tests passed. 55 failed (0.00792932%)
GUJARATI: 366489 out of 366506 tests passed. 17 failed (0.0046384%)
GURMUKHI: 60750 out of 60809 tests passed. 59 failed (0.0970251%)
KANNADA: 951086 out of 951913 tests passed. 827 failed (0.0868777%)
KHMER: 299094 out of 299124 tests passed. 30 failed (0.0100293%)
MALAYALAM: 1048109 out of 1048416 tests passed. 307 failed (0.0292823%)
ORIYA: 42320 out of 42329 tests passed. 9 failed (0.021262%)
SINHALA: 271715 out of 271847 tests passed. 132 failed (0.0485567%)
TAMIL: 1091837 out of 1091837 tests passed. 0 failed (0%)
TELUGU: 970550 out of 970573 tests passed. 23 failed (0.00236973%)

diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index a62f9c9..ce46588 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -295,6 +295,7 @@ class DiffHelpers:
def test_passed (lines):
lines = list (lines)
# XXX This is a hack, but does the job for now.
+   if any (l.find("space|space") >= 0 for l in lines): return True
if any (l.find("uni25CC") >= 0 for l in lines): return True
if any (l.find("dottedcircle") >= 0 for l in lines): return True
return all (l[0] == ' ' for l in lines)
diff --git 
a/test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt 
b/test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt
index ffb408d..78fdeb8 100644
--- 
a/test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt
+++ 
b/test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt
@@ -59,3 +59,5 @@
 ള്യം
 ള്ള
 ല്‍പ്പേ
+ശിം‌
+കോം‌
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


Re: [HarfBuzz] dotted circle is not appearing for dependant vowel

2012-07-24 Thread Behdad Esfahbod
On 07/24/2012 11:36 AM, Harshula wrote:
> 4) So, how do we move forward? Can we suppress the dotted circle by
> including some special Unicode codepoint before the dependent vowel? I
> want the dotted circle to be the default and have an option to suppress
> it when required.

You can use a base of NBSP.  We support that already.

behdad
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


Re: [HarfBuzz] dotted circle is not appearing for dependant vowel

2012-07-24 Thread Jonathan Kew

On 24/7/12 16:05, Shriramana Sharma wrote:

On Tue, Jul 24, 2012 at 6:48 PM, Jonathan Kew  wrote:

In general, I think the Indic shaper should *not* insert dotted circles. The
one exception that I think may be desirable would be the case of
left-reordrant matras when no usable base character


Hi Jonathan -- while I agree in principle that security issues should
not be the responsibility of the shaping engine and support the idea
of allowing a meaningful rendering for अिुा, I wonder why you support
the dotted circle concept for the above case alone?



In cases such as repeated copies of a combining mark, it's reasonable to 
leave it up to the font to handle positioning such that all the marks 
can be seen - e.g. by stacking rather than overprinting successive copies.


However, because the reordering of <0915, 093F> to <093F, 0915> is done 
by the shaping engine before application of OpenType features/lookups, I 
don't see any way for a font designer to make the two combinations 
render differently. After the initial reordering by the shaping engine, 
both <0915, 093F> and <093F, 0915> will yield exactly the same sequence 
of glyphs for OpenType layout features to shape, unless the shaper takes 
specific action such as inserting a dotted circle as 'base' character.


I suppose the same could be considered to apply when there is a 
combining mark at the beginning of a text sequence to be shaped; it has 
no base to which it can apply (i.e. we're dealing with a defective 
combining character sequence, in Unicode terms).


I note that the Unicode standard says that:

"With isolated combining characters or when a process is unable to 
perform graphical combination, a process may present a combining 
character without graphical combination; that is, it may present it as 
if it were a base character."


This could apply to the <093F, 0915> sequence, for example: U+093F is a 
combining character, but it has no base with which to combine. So 
perhaps we should "present it as if it were a base" by applying it to a 
_space_ glyph (rather than a dotted circle). This would result in a 
clear visual distinction between <093F, 0915> (you'd see i-matra applied 
to a space, so it'd be separated from the following ka) and <0915, 
093F>, without arbitrarily introducing U+25CC into the sequence.


JK

___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


Re: [HarfBuzz] dotted circle is not appearing for dependant vowel

2012-07-24 Thread Shriramana Sharma
On Tue, Jul 24, 2012 at 9:06 PM, Harshula  wrote:
> 4) So, how do we move forward? Can we suppress the dotted circle by
> including some special Unicode codepoint before the dependent vowel? I
> want the dotted circle to be the default and have an option to suppress
> it when required.

IIUC some systems (Pango?) allow ZWJ to suppress dotted circles caused
due to unexpected sequences. But should using ZWJ in "meaningless"
sequences (such as vowel signs applied to numbers) also mean
reordering should be done? २ि for instance?

-- 
Shriramana Sharma
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


Re: [HarfBuzz] dotted circle is not appearing for dependant vowel

2012-07-24 Thread Harshula
On Tue, 2012-07-24 at 14:18 +0100, Jonathan Kew wrote:

> In general, I think the Indic shaper should *not* insert dotted circles. 
> The one exception that I think may be desirable would be the case of 
> left-reordrant matras when no usable base character (either consonant or 
> vowel letter, or other "placeholder" such as an explicit U+25cc or a 
> space, no-break space, etc) can be found. In this case inserting a 
> dotted circle (or a space?) to act as the base, and then reordering the 
> matra to the left of it, may be the best option, so that a "visually 
> encoded" sequence िक does not appear identical to the correctly-encoded कि.

1) You've identified the need for the inclusion and exclusion of the
dotted circle.

2) The dotted circle is a great way to easily identify encoding errors,
generated by broken input-methods/OSs, in scripts/languages that I can
comprehend.

3) There are circumstances where the dotted circle is not desirable. For
example, when explaining orthography rules, the dotted circle is an
absolute visual nuisance. IIRC, Pango used to get the dotted circle
glyph from any font, however, ICU would try to get it from the font
being used. Hence with ICU, you could use a font without the dotted
circle glyph to avoid it in presentations.

4) So, how do we move forward? Can we suppress the dotted circle by
including some special Unicode codepoint before the dependent vowel? I
want the dotted circle to be the default and have an option to suppress
it when required.

cya,
#

___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


Re: [HarfBuzz] dotted circle is not appearing for dependant vowel

2012-07-24 Thread Shriramana Sharma
On Tue, Jul 24, 2012 at 6:48 PM, Jonathan Kew  wrote:
> In general, I think the Indic shaper should *not* insert dotted circles. The
> one exception that I think may be desirable would be the case of
> left-reordrant matras when no usable base character

Hi Jonathan -- while I agree in principle that security issues should
not be the responsibility of the shaping engine and support the idea
of allowing a meaningful rendering for अिुा, I wonder why you support
the dotted circle concept for the above case alone?

-- 
Shriramana Sharma
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


Re: [HarfBuzz] dotted circle is not appearing for dependant vowel

2012-07-24 Thread Jonathan Kew

On 24/7/12 12:51, Shriramana Sharma wrote:

On Tue, Jul 24, 2012 at 3:26 PM, Pravin Satpute  wrote:


I see the dotted circle is still not appearing with dependant vowels
(U+093F). Is this intentional?
Might be since you are removing test cases generating dotted circle
in Uniscribe before running it with harfbuzz.


May I take this opportunity to record what I have long felt on the
topic of dotted circles.

I feel that dotted circles should not be displayed except when not
doing so can cause non-canonically-equivalent encoded sequences to
appear the same. That is, they should be displayed only to distinguish
between such sequences. (This is to protect against phishing and
such.)


I don't think phishing protection is the responsibility of a shaping 
engine. There are far too many completely legitimate sequences (in both 
"complex" and "simple" scripts) that can be visually confusable.



For example, the long vowel आ does not have a decomposition to अ+ ा
whereas it would appear the same as the latter if there is no dotted
circle. There are many such "do not use" recommendations for
independent vowels in the Indic Unicode chapters because of the
absences of canonical equivalences (unfortunate IMO but well).


Software designed for phishing protection might indeed want to guard 
against such sequences (among many other things); however, I don't think 
this is the shaping engine's job.



Reordrant vowels like ि are also likewise, because in the case of a
sequence अिक mistakenly typed (or maliciously introduced) for अकि if
there is no dotted circle the two sequences would appear the same


This isn't a particularly good example. In my email client, neither of 
them shows a dotted circle, but neither do they look the same. The first 
one displays the i-matra to the left of the full a-vowel; the second 
displays it between the a-vowel and the ka. This seems like a perfectly 
reasonable way to render the two sequences. If there are use cases (as 
has already been mentioned) for multiple vowel matras on a single base 
consonant, why shouldn't there also be use cases for vowel matras placed 
on a full vowel letter as their base?


A pair that could be more problematic would be कि / िक (0915,093F / 
093F,0915). These do display identically here where I'm typing (although 
many systems doubtless insert a dotted circle in the second case).



which is not appropriate from a security viewpoint as they are not
canonically equivalent.

My point is, there may be many reasons for unexpected combinations of
characters in Indic. Vedic texts are one. Minority orthographies
(which may use rare combinations of vowel signs and diacritics) are
another. Legitimate creative use (like काा) for "k" (a shout)
is yet another. Imposing a limited orthography (i.e. only recognizing
a certain set of patterns of sequences and producing dotted circles
for sequences that do not fit the pattern) would preclude the
usefulness of the rendering system to users of such cases.

Of course, this usability can also be achieved by first imposing a
generic orthography (i.e. script grammar) and later adding more
recognized sequences as per user community request. (This is also much
easier to produce and deliver to the community in open source
ecosystems than in proprietary ones.)

This would be advisable since it may be difficult to predict which
sequences in Indic would be confusable, especially with non-spacing
marks. For example, तु and तुु would be confusable if there is no
dotted circle and the second ु is overlaid upon the first.


A careful font designer can address examples like this by providing 
mark-to-mark positioning rules that will make multiple copies of the 
same mark "stack" rather than simply overprint each other.


Of course, not every font designer will be so careful. But then, not 
every Latin-script font adequately distinguishes 'I', 'l', and '1', 
either. We can't expect shaping engines to somehow make up for visual 
ambiguities in font designs.




But these sequences are not self-obvious, so it appears creating
regexs for sequences where dotted circles should *not* be produced
might be easier than to do so where they *should* be produced and it
would be appropriate to err on the side of caution.


IMO, "to err on the side of caution" in the matter of dotted-circle 
insertion means that we should avoid the risk of blocking a use case 
that someone might someday want, even if we can't anticipate that 
particular need. So, for example, even though we may not be aware of any 
current need for a sequence such as "अिुा", there's no compelling reason 
for a shaping engine to insert dotted circles into it and thus make it 
impossible for a user to encode and render an a-vowel with these three 
matras placed around it.


In general, I think the Indic shaper should *not* insert dotted circles. 
The one exception that I think may be desirable would be the case of 
left-reordrant matras when no usable base character (ei

Re: [HarfBuzz] dotted circle is not appearing for dependant vowel

2012-07-24 Thread Shriramana Sharma
On Tue, Jul 24, 2012 at 3:26 PM, Pravin Satpute  wrote:
>
>I see the dotted circle is still not appearing with dependant vowels
> (U+093F). Is this intentional?
>Might be since you are removing test cases generating dotted circle
> in Uniscribe before running it with harfbuzz.

May I take this opportunity to record what I have long felt on the
topic of dotted circles.

I feel that dotted circles should not be displayed except when not
doing so can cause non-canonically-equivalent encoded sequences to
appear the same. That is, they should be displayed only to distinguish
between such sequences. (This is to protect against phishing and
such.)

For example, the long vowel आ does not have a decomposition to अ+ ा
whereas it would appear the same as the latter if there is no dotted
circle. There are many such "do not use" recommendations for
independent vowels in the Indic Unicode chapters because of the
absences of canonical equivalences (unfortunate IMO but well).
Reordrant vowels like ि are also likewise, because in the case of a
sequence अिक mistakenly typed (or maliciously introduced) for अकि if
there is no dotted circle the two sequences would appear the same
which is not appropriate from a security viewpoint as they are not
canonically equivalent.

My point is, there may be many reasons for unexpected combinations of
characters in Indic. Vedic texts are one. Minority orthographies
(which may use rare combinations of vowel signs and diacritics) are
another. Legitimate creative use (like काा) for "k" (a shout)
is yet another. Imposing a limited orthography (i.e. only recognizing
a certain set of patterns of sequences and producing dotted circles
for sequences that do not fit the pattern) would preclude the
usefulness of the rendering system to users of such cases.

Of course, this usability can also be achieved by first imposing a
generic orthography (i.e. script grammar) and later adding more
recognized sequences as per user community request. (This is also much
easier to produce and deliver to the community in open source
ecosystems than in proprietary ones.)

This would be advisable since it may be difficult to predict which
sequences in Indic would be confusable, especially with non-spacing
marks. For example, तु and तुु would be confusable if there is no
dotted circle and the second ु is overlaid upon the first.

But these sequences are not self-obvious, so it appears creating
regexs for sequences where dotted circles should *not* be produced
might be easier than to do so where they *should* be produced and it
would be appropriate to err on the side of caution.

I had to say this, being a scholar of Sanskrit and Vedic, which really
puts scripts (and hence software support for them) to their limit.
Pravin (OP on this thread) and I, we have plans for developing a Lohit
Devanagari Vedic font, so we'll be coming back on this...

-- 
Shriramana Sharma
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


[HarfBuzz] dotted circle is not appearing for dependant vowel

2012-07-24 Thread Pravin Satpute
Hi Behdad,

   I see the dotted circle is still not appearing with dependant vowels
(U+093F). Is this intentional?
   Might be since you are removing test cases generating dotted circle
in Uniscribe before running it with harfbuzz.

Regards,
Pravin Satpute
___
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz


[HarfBuzz] harfbuzz-ng: Branch 'master' - 21 commits

2012-07-24 Thread Behdad Esfahbod
 src/hb-ot-layout-gsubgpos-private.hh   
 |   27 -
 src/hb-ot-layout-private.hh
 |   35 ++
 src/hb-ot-shape-complex-indic-machine.rl   
 |6 
 src/hb-ot-shape-complex-indic-private.hh   
 |2 
 src/hb-ot-shape-complex-indic.cc   
 |  157 +++---
 src/hb-ot-shape.cc 
 |6 
 test/shaping/texts/in-tree/shaper-indic/indic/script-bengali/misc/reph.txt 
 |4 
 test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt   
 |1 
 test/shaping/texts/in-tree/shaper-indic/indic/script-sinhala/misc/misc.txt 
 |7 
 
test/shaping/texts/in-tree/shaper-indic/south-east-asian/script-khmer/misc/misc.txt
 |6 
 10 files changed, 172 insertions(+), 79 deletions(-)

New commits:
commit 65c43accdc4d2082282d5cedba8514b8df0c18a2
Author: Behdad Esfahbod 
Date:   Tue Jul 24 03:36:47 2012 -0400

[Indic] Better position left-matra in Malayalam

Just put it before base, which is what's expected.

Malayalam failures down from 1559 to 1197 (0.114172%).

BENGALI: 353988 out of 354285 tests passed. 297 failed (0.0838308%)
DEVANAGARI: 693571 out of 693628 tests passed. 57 failed (0.00821766%)
GUJARATI: 366489 out of 366506 tests passed. 17 failed (0.0046384%)
GURMUKHI: 60750 out of 60809 tests passed. 59 failed (0.0970251%)
KANNADA: 950956 out of 951913 tests passed. 957 failed (0.100534%)
KHMER: 299094 out of 299124 tests passed. 30 failed (0.0100293%)
MALAYALAM: 1047219 out of 1048416 tests passed. 1197 failed (0.114172%)
ORIYA: 42320 out of 42329 tests passed. 9 failed (0.021262%)
SINHALA: 271699 out of 271847 tests passed. 148 failed (0.0544424%)
TAMIL: 1091837 out of 1091837 tests passed. 0 failed (0%)
TELUGU: 970524 out of 970573 tests passed. 49 failed (0.00504856%)

diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc
index d90d238..d0c3c09 100644
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc
@@ -895,22 +895,37 @@ final_reordering_syllable (hb_buffer_t *buffer,
* halant, position is moved after it.
*/
 
-  if (start < base) /* Otherwise there can't be any pre-base matra characters. 
*/
+  if (start + 1 < end && start < base) /* Otherwise there can't be any 
pre-base matra characters. */
   {
-unsigned int new_pos = base - 1;
-while (new_pos > start &&
-  !(is_one_of (info[new_pos], (FLAG (OT_M) | FLAG (OT_H) | FLAG 
(OT_Coeng)
-  new_pos--;
-/* If we found no Halant we are done (just need to update clusters).
- * Otherwise only proceed if the Halant does
- * not belong to the Matra itself! */
-if (is_halant_or_coeng (info[new_pos]) &&
-   info[new_pos].indic_position() != POS_PRE_M)
+/* If we lost track of base, alas, position before last thingy. */
+unsigned int new_pos = base == end ? base - 2 : base - 1;
+
+/* Malayalam does not have "half" forms or explicit virama forms.
+ * The glyphs formed by 'half' are Chillus.  We want to position
+ * matra after them all.
+ */
+if (buffer->props.script != HB_SCRIPT_MALAYALAM)
 {
-  /* -> If ZWJ or ZWNJ follow this halant, position is moved after it. */
-  if (new_pos + 1 < end && is_joiner (info[new_pos + 1]))
-   new_pos++;
+  while (new_pos > start &&
+!(is_one_of (info[new_pos], (FLAG (OT_M) | FLAG (OT_H) | FLAG 
(OT_Coeng)
+   new_pos--;
+
+  /* If we found no Halant we are done.
+   * Otherwise only proceed if the Halant does
+   * not belong to the Matra itself! */
+  if (is_halant_or_coeng (info[new_pos]) &&
+ info[new_pos].indic_position() != POS_PRE_M)
+  {
+   /* -> If ZWJ or ZWNJ follow this halant, position is moved after it. */
+   if (new_pos + 1 < end && is_joiner (info[new_pos + 1]))
+ new_pos++;
+  }
+  else
+new_pos = start; /* No move. */
+}
 
+if (start < new_pos)
+{
   /* Now go see if there's actually any matras... */
   for (unsigned int i = new_pos; i > start; i--)
if (info[i - 1].indic_position () == POS_PRE_M)
diff --git 
a/test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt 
b/test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt
index 3072b0a..ffb408d 100644
--- 
a/test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt
+++ 
b/test/shaping/texts/in-tree/shaper-indic/indic/script-malayalam/misc/misc.txt
@@ -58,3 +58,4 @@
 ള്‍
 ള്യം
 ള്ള
+ല്‍പ്പേ
commit 88f413b56f2858d149e2fc067685aeecaea779ca
Author: Behdad Esfahbod 
Date:   Tue Jul 24 03:04:36 2012 -0400

[Indic] Implement R