Author: Amaury Forgeot d'Arc <[email protected]>
Branch: py3.3
Changeset: r76424:751c321375f4
Date: 2015-03-17 10:13 +0100
http://bitbucket.org/pypy/pypy/changeset/751c321375f4/

Log:    Unicodedata: ensure that PUA codes cannot be used to retrieve
        aliases.

diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -105,7 +105,7 @@
     @unwrap_spec(name=str)
     def lookup(self, space, name):
         try:
-            code = self._lookup(name.upper())
+            code = self._lookup(name.upper(), with_named_sequence=True)
         except KeyError:
             msg = space.mod(space.wrap("undefined character name '%s'"), space.wrap(name))
             raise OperationError(space.w_KeyError, msg)
diff --git a/pypy/module/unicodedata/test/test_unicodedata.py b/pypy/module/unicodedata/test/test_unicodedata.py
--- a/pypy/module/unicodedata/test/test_unicodedata.py
+++ b/pypy/module/unicodedata/test/test_unicodedata.py
@@ -139,4 +139,11 @@
         ]
         for seqname, codepoints in sequences:
             assert unicodedata.lookup(seqname) == codepoints
+            raises(SyntaxError, eval, r'"\N{%s}"' % seqname)
 
+    def test_names_in_pua_range(self):
+        # We are storing named seq in the PUA 15, but their names shouldn't leak
+        import unicodedata
+        for cp in range(0xf0000, 0xf0300, 7):
+            exc = raises(ValueError, unicodedata.name, chr(cp))
+            assert str(exc.value) == 'no such name'
diff --git a/rpython/rlib/unicodedata/generate_unicodedb.py b/rpython/rlib/unicodedata/generate_unicodedb.py
--- a/rpython/rlib/unicodedata/generate_unicodedb.py
+++ b/rpython/rlib/unicodedata/generate_unicodedb.py
@@ -591,29 +591,32 @@
         if not ('0' <= c <= '9' or 'A' <= c <= 'F'):
             raise KeyError
     code = int(cjk_code, 16)
-    if %s:
+    if %(cjk_interval)s:
         return code
     raise KeyError
 
-def lookup(name):
+def lookup(name, with_named_sequence=False):
     if name[:len(_cjk_prefix)] == _cjk_prefix:
         return _lookup_cjk(name[len(_cjk_prefix):])
     if name[:len(_hangul_prefix)] == _hangul_prefix:
         return _lookup_hangul(name[len(_hangul_prefix):])
 
     if not base_mod:
-        return trie_lookup(name)
+        code = trie_lookup(name)
     else:
         try:
-            return _code_by_name[name]
+            code = _code_by_name[name]
         except KeyError:
             if name not in _code_by_name_corrected:
-                return base_mod.trie_lookup(name)
+                code = base_mod.trie_lookup(name)
             else:
                 raise
+    if not with_named_sequence and %(named_sequence_interval)s:
+        raise KeyError
+    return code
 
 def name(code):
-    if %s:
+    if %(cjk_interval)s:
         return "CJK UNIFIED IDEOGRAPH-" + hex(code)[2:].upper()
     if 0xAC00 <= code <= 0xD7A3:
         # vl_code, t_code = divmod(code - 0xAC00, len(_hangul_T))
@@ -624,6 +627,8 @@
         v_code = vl_code %% len(_hangul_V)
         return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
                 _hangul_V[v_code] + _hangul_T[t_code])
+    if %(pua_interval)s:
+        raise KeyError
 
     if not base_mod:
         return lookup_charcode(code)
@@ -635,7 +640,9 @@
                 return base_mod.lookup_charcode(code)
             else:
                 raise
-''' % (cjk_interval, cjk_interval)
+''' % dict(cjk_interval=cjk_interval,
+           pua_interval="0xF0000 <= code < 0xF0400",
+           named_sequence_interval="0xF0200 <= code < 0xF0400")
 
     # Categories
     writeDbRecord(outfile, table)
@@ -810,8 +817,8 @@
     print >> outfile, ']'
     print >> outfile, '''
 
-def lookup_with_alias(name):
-    code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+    code = lookup(name, with_named_sequence=with_named_sequence)
     if 0 <= code - %(start)s < len(_name_aliases):
         return _name_aliases[code - %(start)s]
     else:
diff --git a/rpython/rlib/unicodedata/unicodedb_3_2_0.py b/rpython/rlib/unicodedata/unicodedb_3_2_0.py
--- a/rpython/rlib/unicodedata/unicodedb_3_2_0.py
+++ b/rpython/rlib/unicodedata/unicodedb_3_2_0.py
@@ -16788,22 +16788,25 @@
         return code
     raise KeyError
 
-def lookup(name):
+def lookup(name, with_named_sequence=False):
     if name[:len(_cjk_prefix)] == _cjk_prefix:
         return _lookup_cjk(name[len(_cjk_prefix):])
     if name[:len(_hangul_prefix)] == _hangul_prefix:
         return _lookup_hangul(name[len(_hangul_prefix):])
 
     if not base_mod:
-        return trie_lookup(name)
+        code = trie_lookup(name)
     else:
         try:
-            return _code_by_name[name]
+            code = _code_by_name[name]
         except KeyError:
             if name not in _code_by_name_corrected:
-                return base_mod.trie_lookup(name)
+                code = base_mod.trie_lookup(name)
             else:
                 raise
+    if not with_named_sequence and 0xF0200 <= code < 0xF0400:
+        raise KeyError
+    return code
 
 def name(code):
     if (0x3400 <= code <= 0x4DB5 or 0x4E00 <= code <= 0x9FA5 or 0x20000 <= code <= 0x2A6D6):
@@ -16817,6 +16820,8 @@
         v_code = vl_code % len(_hangul_V)
         return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
                 _hangul_V[v_code] + _hangul_T[t_code])
+    if 0xF0000 <= code < 0xF0400:
+        raise KeyError
 
     if not base_mod:
         return lookup_charcode(code)
@@ -21288,8 +21293,8 @@
 ]
 
 
-def lookup_with_alias(name):
-    code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+    code = lookup(name, with_named_sequence=with_named_sequence)
     if 0 <= code - 983040 < len(_name_aliases):
         return _name_aliases[code - 983040]
     else:
diff --git a/rpython/rlib/unicodedata/unicodedb_5_2_0.py b/rpython/rlib/unicodedata/unicodedb_5_2_0.py
--- a/rpython/rlib/unicodedata/unicodedb_5_2_0.py
+++ b/rpython/rlib/unicodedata/unicodedb_5_2_0.py
@@ -136803,22 +136803,25 @@
         return code
     raise KeyError
 
-def lookup(name):
+def lookup(name, with_named_sequence=False):
     if name[:len(_cjk_prefix)] == _cjk_prefix:
         return _lookup_cjk(name[len(_cjk_prefix):])
     if name[:len(_hangul_prefix)] == _hangul_prefix:
         return _lookup_hangul(name[len(_hangul_prefix):])
 
     if not base_mod:
-        return trie_lookup(name)
+        code = trie_lookup(name)
     else:
         try:
-            return _code_by_name[name]
+            code = _code_by_name[name]
         except KeyError:
             if name not in _code_by_name_corrected:
-                return base_mod.trie_lookup(name)
+                code = base_mod.trie_lookup(name)
             else:
                 raise
+    if not with_named_sequence and 0xF0200 <= code < 0xF0400:
+        raise KeyError
+    return code
 
 def name(code):
     if (0x3400 <= code <= 0x4DB5 or 0x4E00 <= code <= 0x9FCB or 0x20000 <= code <= 0x2A6D6 or 0x2A700 <= code <= 0x2B734):
@@ -136832,6 +136835,8 @@
         v_code = vl_code % len(_hangul_V)
         return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
                 _hangul_V[v_code] + _hangul_T[t_code])
+    if 0xF0000 <= code < 0xF0400:
+        raise KeyError
 
     if not base_mod:
         return lookup_charcode(code)
@@ -157497,8 +157502,8 @@
 ]
 
 
-def lookup_with_alias(name):
-    code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+    code = lookup(name, with_named_sequence=with_named_sequence)
     if 0 <= code - 983040 < len(_name_aliases):
         return _name_aliases[code - 983040]
     else:
diff --git a/rpython/rlib/unicodedata/unicodedb_6_0_0.py b/rpython/rlib/unicodedata/unicodedb_6_0_0.py
--- a/rpython/rlib/unicodedata/unicodedb_6_0_0.py
+++ b/rpython/rlib/unicodedata/unicodedb_6_0_0.py
@@ -4520,22 +4520,25 @@
         return code
     raise KeyError
 
-def lookup(name):
+def lookup(name, with_named_sequence=False):
     if name[:len(_cjk_prefix)] == _cjk_prefix:
         return _lookup_cjk(name[len(_cjk_prefix):])
     if name[:len(_hangul_prefix)] == _hangul_prefix:
         return _lookup_hangul(name[len(_hangul_prefix):])
 
     if not base_mod:
-        return trie_lookup(name)
+        code = trie_lookup(name)
     else:
         try:
-            return _code_by_name[name]
+            code = _code_by_name[name]
         except KeyError:
             if name not in _code_by_name_corrected:
-                return base_mod.trie_lookup(name)
+                code = base_mod.trie_lookup(name)
             else:
                 raise
+    if not with_named_sequence and 0xF0200 <= code < 0xF0400:
+        raise KeyError
+    return code
 
 def name(code):
     if (0x3400 <= code <= 0x4DB5 or 0x4E00 <= code <= 0x9FCB or 0x20000 <= code <= 0x2A6D6 or 0x2A700 <= code <= 0x2B734 or 0x2B740 <= code <= 0x2B81D):
@@ -4549,6 +4552,8 @@
         v_code = vl_code % len(_hangul_V)
         return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
                 _hangul_V[v_code] + _hangul_T[t_code])
+    if 0xF0000 <= code < 0xF0400:
+        raise KeyError
 
     if not base_mod:
         return lookup_charcode(code)
@@ -7240,8 +7245,8 @@
 ]
 
 
-def lookup_with_alias(name):
-    code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+    code = lookup(name, with_named_sequence=with_named_sequence)
     if 0 <= code - 983040 < len(_name_aliases):
         return _name_aliases[code - 983040]
     else:
diff --git a/rpython/rlib/unicodedata/unicodedb_6_2_0.py b/rpython/rlib/unicodedata/unicodedb_6_2_0.py
--- a/rpython/rlib/unicodedata/unicodedb_6_2_0.py
+++ b/rpython/rlib/unicodedata/unicodedb_6_2_0.py
@@ -6890,22 +6890,25 @@
         return code
     raise KeyError
 
-def lookup(name):
+def lookup(name, with_named_sequence=False):
     if name[:len(_cjk_prefix)] == _cjk_prefix:
         return _lookup_cjk(name[len(_cjk_prefix):])
     if name[:len(_hangul_prefix)] == _hangul_prefix:
         return _lookup_hangul(name[len(_hangul_prefix):])
 
     if not base_mod:
-        return trie_lookup(name)
+        code = trie_lookup(name)
     else:
         try:
-            return _code_by_name[name]
+            code = _code_by_name[name]
         except KeyError:
             if name not in _code_by_name_corrected:
-                return base_mod.trie_lookup(name)
+                code = base_mod.trie_lookup(name)
             else:
                 raise
+    if not with_named_sequence and 0xF0200 <= code < 0xF0400:
+        raise KeyError
+    return code
 
 def name(code):
     if (0x3400 <= code <= 0x4DB5 or 0x4E00 <= code <= 0x9FCB or 0x20000 <= code <= 0x2A6D6 or 0x2A700 <= code <= 0x2B734 or 0x2B740 <= code <= 0x2B81D):
@@ -6919,6 +6922,8 @@
         v_code = vl_code % len(_hangul_V)
         return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
                 _hangul_V[v_code] + _hangul_T[t_code])
+    if 0xF0000 <= code < 0xF0400:
+        raise KeyError
 
     if not base_mod:
         return lookup_charcode(code)
@@ -10521,8 +10526,8 @@
 ]
 
 
-def lookup_with_alias(name):
-    code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+    code = lookup(name, with_named_sequence=with_named_sequence)
     if 0 <= code - 983040 < len(_name_aliases):
         return _name_aliases[code - 983040]
     else:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to