[HarfBuzz] harfbuzz-ng: Branch 'master' - 5 commits

Behdad Esfahbod Wed, 09 May 2012 01:12:48 -0700

 test/shaping/Makefile.am      |    2 
 test/shaping/hb-diff-ngrams   |    5 +
 test/shaping/hb-diff-stat     |    5 +
 test/shaping/hb_test_tools.py |  174 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 177 insertions(+), 9 deletions(-)


New commits:
commit 2214a03900d32710573a1b05c7665195b3129761
Author: Behdad Esfahbod <beh...@behdad.org>
Date:   Wed May 9 09:54:54 2012 +0200

    Add hb-diff-ngrams

diff --git a/test/shaping/Makefile.am b/test/shaping/Makefile.am
index 81c9991..4fb762c 100644
--- a/test/shaping/Makefile.am
+++ b/test/shaping/Makefile.am
@@ -13,6 +13,7 @@ EXTRA_DIST += \
        hb-diff \
        hb-diff-colorize \
        hb-diff-filter-failures \
+       hb-diff-ngrams \
        hb-diff-stat \
        hb-manifest-read \
        hb-manifest-update \
diff --git a/test/shaping/hb-diff-ngrams b/test/shaping/hb-diff-ngrams
new file mode 100755
index 0000000..a496447
--- /dev/null
+++ b/test/shaping/hb-diff-ngrams
@@ -0,0 +1,5 @@
+#!/usr/bin/python
+
+from hb_test_tools import *
+
+UtilMains.process_multiple_files (DiffSinks.print_ngrams)
diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index a38f067..3ff75b8 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -155,12 +155,60 @@ class DiffFilters:
                        if not DiffHelpers.test_passed (lines):
                                for l in lines: yield l
 
+class Stat:
+
+       def __init__ (self):
+               self.count = 0
+               self.freq = 0
+
+       def add (self, test):
+               self.count += 1
+               self.freq += test.freq
+
+class Stats:
+
+       def __init__ (self):
+               self.passed = Stat ()
+               self.failed = Stat ()
+               self.total  = Stat ()
+
+       def add (self, test):
+               self.total.add (test)
+               if test.passed:
+                       self.passed.add (test)
+               else:
+                       self.failed.add (test)
+
+       def mean (self):
+               return float (self.passed.count) / self.total.count
+
+       def variance (self):
+               return (float (self.passed.count) / self.total.count) * \
+                      (float (self.failed.count) / self.total.count)
+
+       def stddev (self):
+               return self.variance () ** .5
+
+       def zscore (self, population):
+               """Calculate the standard score.
+                  Population is the Stats for population.
+                  Self is Stats for sample.
+                  Returns larger absolute value if sample is highly unlikely to be random.
+                  Anything outside of -3..+3 is very unlikely to be random.
+                  See: http://en.wikipedia.org/wiki/Standard_score""";
+
+               return (self.mean () - population.mean ()) / population.stddev ()
+
+
+
+
 class DiffSinks:
 
        @staticmethod
        def print_stat (f):
                passed = 0
                failed = 0
+               # XXX port to Stats, but that would really slow us down here
                for key, lines in DiffHelpers.separate_test_cases (f):
                        if DiffHelpers.test_passed (lines):
                                passed += 1
@@ -172,21 +220,34 @@ class DiffSinks:
        @staticmethod
        def print_ngrams (f, ns=(1,2,3)):
                gens = tuple (Ngram.generator (n) for n in ns)
+               allstats = Stats ()
+               allgrams = {}
                for key, lines in DiffHelpers.separate_test_cases (f):
                        test = Test (lines)
-                       unicodes = test.unicodes
-                       del test
+                       allstats.add (test)
 
                        for gen in gens:
-                               print "Printing %d-grams:" % gen.n
-                               for ngram in gen (unicodes):
-                                       print ngram
+                               for ngram in gen (test.unicodes):
+                                       if ngram not in allgrams:
+                                               allgrams[ngram] = Stats ()
+                                       allgrams[ngram].add (test)
+
+               importantgrams = {}
+               for ngram, stats in allgrams.iteritems ():
+                       if stats.failed.count >= 30: # for statistical reasons
+                               importantgrams[ngram] = stats
+               allgrams = importantgrams
+               del importantgrams
+
+               for ngram, stats in allgrams.iteritems ():
+                       print "zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram))
 
 
 
 class Test:
 
        def __init__ (self, lines):
+               self.freq = 1
                self.passed = True
                self.identifier = None
                self.text = None
commit 178e6dce01ad28c8708bad62ce0fb79c46e836dc
Author: Behdad Esfahbod <beh...@behdad.org>
Date:   Wed May 9 08:57:29 2012 +0200

    Add N-gram generator

diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index d3c0939..a38f067 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -169,6 +169,53 @@ class DiffSinks:
                total = passed + failed
                print "%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
 
+       @staticmethod
+       def print_ngrams (f, ns=(1,2,3)):
+               gens = tuple (Ngram.generator (n) for n in ns)
+               for key, lines in DiffHelpers.separate_test_cases (f):
+                       test = Test (lines)
+                       unicodes = test.unicodes
+                       del test
+
+                       for gen in gens:
+                               print "Printing %d-grams:" % gen.n
+                               for ngram in gen (unicodes):
+                                       print ngram
+
+
+
+class Test:
+
+       def __init__ (self, lines):
+               self.passed = True
+               self.identifier = None
+               self.text = None
+               self.unicodes = None
+               self.glyphs = None
+               for l in lines:
+                       symbol = l[0]
+                       if symbol != ' ':
+                               self.passed = False
+                       i = 1
+                       if ':' in l:
+                               i = l.index (':')
+                               if not self.identifier:
+                                       self.identifier = l[1:i]
+                               i = i + 2 # Skip colon and space
+                       j = -1
+                       if l[j] == '\n':
+                               j -= 1
+                       brackets = l[i] + l[j]
+                       l = l[i+1:-2]
+                       if brackets == '()':
+                               self.text = l
+                       elif brackets == '<>':
+                               self.unicodes = Unicode.parse (l)
+                       elif brackets == '[]':
+                               # XXX we don't handle failed tests here
+                               self.glyphs = l
+
+
 class DiffHelpers:
 
        @staticmethod
@@ -205,6 +252,23 @@ class FilterHelpers:
                return printer
 
 
+class Ngram:
+
+       @staticmethod
+       def generator (n):
+
+               def gen (f):
+                       l = []
+                       for x in f:
+                               l.append (x)
+                               if len (l) == n:
+                                       yield tuple (l)
+                                       l[:1] = []
+
+               gen.n = n
+               return gen
+
+
 class UtilMains:
 
        @staticmethod
@@ -276,10 +340,14 @@ class Unicode:
                return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>'
 
        @staticmethod
-       def encode (s):
+       def parse (s):
                s = re.sub (r"[<+>,\\uU\n       ]", " ", s)
                s = re.sub (r"0[xX]", " ", s)
-               return u''.join (unichr (int (x, 16)) for x in s.split (' ') if len (x)).encode ('utf-8')
+               return [int (x, 16) for x in s.split (' ') if len (x)]
+
+       @staticmethod
+       def encode (s):
+               return u''.join (unichr (x) for x in Unicode.parse (s)).encode ('utf-8')
 
        shorthands = {
                "ZERO WIDTH NON-JOINER": "ZWNJ",
commit 98669ceb77657d60435f2cb2e3fc18272c0a2c6a
Author: Behdad Esfahbod <beh...@behdad.org>
Date:   Wed May 9 08:16:15 2012 +0200

    Use groupby()

diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index 70a9ce1..d3c0939 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -150,7 +150,8 @@ class DiffFilters:
 
        @staticmethod
        def filter_failures (f):
-               for lines in DiffHelpers.separate_test_cases (f):
+               for key, lines in DiffHelpers.separate_test_cases (f):
+                       lines = list (lines)
                        if not DiffHelpers.test_passed (lines):
                                for l in lines: yield l
 
@@ -160,7 +161,7 @@ class DiffSinks:
        def print_stat (f):
                passed = 0
                failed = 0
-               for lines in DiffHelpers.separate_test_cases (f):
+               for key, lines in DiffHelpers.separate_test_cases (f):
                        if DiffHelpers.test_passed (lines):
                                passed += 1
                        else:
@@ -176,22 +177,11 @@ class DiffHelpers:
                   have a colon character, groups them by identifier,
                   yielding lists of all lines with the same identifier.'''
 
-               acc = []
-               iden = None
-               for l in f:
-                       if ':' not in l:
-                               if acc: yield acc
-                               acc = []
-                               iden = None
-                               yield [l]
-                               continue
-                       l_iden = l[1:l.index (':')]
-                       if acc and iden != l_iden:
-                               yield acc
-                               acc = []
-                       iden = l_iden
-                       acc.append (l)
-               if acc: yield acc
+               def identifier (l):
+                       if ':' in l[1:]:
+                               return l[1:l.index (':')]
+                       return l
+               return groupby (f, key=identifier)
 
        @staticmethod
        def test_passed (lines):
commit c438a14b62433db488b5c90854a4a3934adf3305
Author: Behdad Esfahbod <beh...@behdad.org>
Date:   Wed May 9 07:45:17 2012 +0200

    Add hb-diff-stat

diff --git a/test/shaping/Makefile.am b/test/shaping/Makefile.am
index f216c5d..81c9991 100644
--- a/test/shaping/Makefile.am
+++ b/test/shaping/Makefile.am
@@ -13,6 +13,7 @@ EXTRA_DIST += \
        hb-diff \
        hb-diff-colorize \
        hb-diff-filter-failures \
+       hb-diff-stat \
        hb-manifest-read \
        hb-manifest-update \
        hb-unicode-decode \
diff --git a/test/shaping/hb-diff-stat b/test/shaping/hb-diff-stat
new file mode 100755
index 0000000..81626e1
--- /dev/null
+++ b/test/shaping/hb-diff-stat
@@ -0,0 +1,5 @@
+#!/usr/bin/python
+
+from hb_test_tools import *
+
+UtilMains.process_multiple_files (DiffSinks.print_stat)
diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index 17181ac..70a9ce1 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -151,9 +151,23 @@ class DiffFilters:
        @staticmethod
        def filter_failures (f):
                for lines in DiffHelpers.separate_test_cases (f):
-                       if any (l[0] != ' ' for l in lines):
+                       if not DiffHelpers.test_passed (lines):
                                for l in lines: yield l
 
+class DiffSinks:
+
+       @staticmethod
+       def print_stat (f):
+               passed = 0
+               failed = 0
+               for lines in DiffHelpers.separate_test_cases (f):
+                       if DiffHelpers.test_passed (lines):
+                               passed += 1
+                       else:
+                               failed += 1
+               total = passed + failed
+               print "%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
+
 class DiffHelpers:
 
        @staticmethod
@@ -175,10 +189,14 @@ class DiffHelpers:
                        if acc and iden != l_iden:
                                yield acc
                                acc = []
-                               iden = l_iden
+                       iden = l_iden
                        acc.append (l)
                if acc: yield acc
 
+       @staticmethod
+       def test_passed (lines):
+               return all (l[0] == ' ' for l in lines)
+
 
 class FilterHelpers:
 
commit 1058d031e2046eb80331b0950eaff75c2bf608dc
Author: Behdad Esfahbod <beh...@behdad.org>
Date:   Wed May 9 07:30:07 2012 +0200

    Make hb-diff-filter-failures retain all test info for failed tests

diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index 03a7710..17181ac 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -149,16 +149,35 @@ class ZipDiffer:
 class DiffFilters:
 
        @staticmethod
-       def filter_failures (f, symbols=diff_symbols):
-               for l in f:
-                       if l[0] in symbols:
-                               # TODO retain all lines of the failure
-                               yield l
+       def filter_failures (f):
+               for lines in DiffHelpers.separate_test_cases (f):
+                       if any (l[0] != ' ' for l in lines):
+                               for l in lines: yield l
 
+class DiffHelpers:
 
-class ShapeFilters:
+       @staticmethod
+       def separate_test_cases (f):
+               '''Reads lines from f, and if the lines have identifiers, ie.
+                  have a colon character, groups them by identifier,
+                  yielding lists of all lines with the same identifier.'''
 
-       pass
+               acc = []
+               iden = None
+               for l in f:
+                       if ':' not in l:
+                               if acc: yield acc
+                               acc = []
+                               iden = None
+                               yield [l]
+                               continue
+                       l_iden = l[1:l.index (':')]
+                       if acc and iden != l_iden:
+                               yield acc
+                               acc = []
+                               iden = l_iden
+                       acc.append (l)
+               if acc: yield acc
 
 
 class FilterHelpers:
_______________________________________________
HarfBuzz mailing list
HarfBuzz@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/harfbuzz

[HarfBuzz] harfbuzz-ng: Branch 'master' - 5 commits

Reply via email to