Author: Amaury Forgeot d'Arc <[email protected]>
Branch: py3.3
Changeset: r76424:751c321375f4
Date: 2015-03-17 10:13 +0100
http://bitbucket.org/pypy/pypy/changeset/751c321375f4/
Log: Unicodedata: ensure that PUA codes cannot be used to retrieve
aliases.
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -105,7 +105,7 @@
@unwrap_spec(name=str)
def lookup(self, space, name):
try:
- code = self._lookup(name.upper())
+ code = self._lookup(name.upper(), with_named_sequence=True)
except KeyError:
msg = space.mod(space.wrap("undefined character name '%s'"),
space.wrap(name))
raise OperationError(space.w_KeyError, msg)
diff --git a/pypy/module/unicodedata/test/test_unicodedata.py b/pypy/module/unicodedata/test/test_unicodedata.py
--- a/pypy/module/unicodedata/test/test_unicodedata.py
+++ b/pypy/module/unicodedata/test/test_unicodedata.py
@@ -139,4 +139,11 @@
]
for seqname, codepoints in sequences:
assert unicodedata.lookup(seqname) == codepoints
+ raises(SyntaxError, eval, r'"\N{%s}"' % seqname)
+ def test_names_in_pua_range(self):
+ # We are storing named seq in the PUA 15, but their names shouldn't leak
+ import unicodedata
+ for cp in range(0xf0000, 0xf0300, 7):
+ exc = raises(ValueError, unicodedata.name, chr(cp))
+ assert str(exc.value) == 'no such name'
diff --git a/rpython/rlib/unicodedata/generate_unicodedb.py b/rpython/rlib/unicodedata/generate_unicodedb.py
--- a/rpython/rlib/unicodedata/generate_unicodedb.py
+++ b/rpython/rlib/unicodedata/generate_unicodedb.py
@@ -591,29 +591,32 @@
if not ('0' <= c <= '9' or 'A' <= c <= 'F'):
raise KeyError
code = int(cjk_code, 16)
- if %s:
+ if %(cjk_interval)s:
return code
raise KeyError
-def lookup(name):
+def lookup(name, with_named_sequence=False):
if name[:len(_cjk_prefix)] == _cjk_prefix:
return _lookup_cjk(name[len(_cjk_prefix):])
if name[:len(_hangul_prefix)] == _hangul_prefix:
return _lookup_hangul(name[len(_hangul_prefix):])
if not base_mod:
- return trie_lookup(name)
+ code = trie_lookup(name)
else:
try:
- return _code_by_name[name]
+ code = _code_by_name[name]
except KeyError:
if name not in _code_by_name_corrected:
- return base_mod.trie_lookup(name)
+ code = base_mod.trie_lookup(name)
else:
raise
+ if not with_named_sequence and %(named_sequence_interval)s:
+ raise KeyError
+ return code
def name(code):
- if %s:
+ if %(cjk_interval)s:
return "CJK UNIFIED IDEOGRAPH-" + hex(code)[2:].upper()
if 0xAC00 <= code <= 0xD7A3:
# vl_code, t_code = divmod(code - 0xAC00, len(_hangul_T))
@@ -624,6 +627,8 @@
v_code = vl_code %% len(_hangul_V)
return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
_hangul_V[v_code] + _hangul_T[t_code])
+ if %(pua_interval)s:
+ raise KeyError
if not base_mod:
return lookup_charcode(code)
@@ -635,7 +640,9 @@
return base_mod.lookup_charcode(code)
else:
raise
-''' % (cjk_interval, cjk_interval)
+''' % dict(cjk_interval=cjk_interval,
+ pua_interval="0xF0000 <= code < 0xF0400",
+ named_sequence_interval="0xF0200 <= code < 0xF0400")
# Categories
writeDbRecord(outfile, table)
@@ -810,8 +817,8 @@
print >> outfile, ']'
print >> outfile, '''
-def lookup_with_alias(name):
- code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+ code = lookup(name, with_named_sequence=with_named_sequence)
if 0 <= code - %(start)s < len(_name_aliases):
return _name_aliases[code - %(start)s]
else:
diff --git a/rpython/rlib/unicodedata/unicodedb_3_2_0.py b/rpython/rlib/unicodedata/unicodedb_3_2_0.py
--- a/rpython/rlib/unicodedata/unicodedb_3_2_0.py
+++ b/rpython/rlib/unicodedata/unicodedb_3_2_0.py
@@ -16788,22 +16788,25 @@
return code
raise KeyError
-def lookup(name):
+def lookup(name, with_named_sequence=False):
if name[:len(_cjk_prefix)] == _cjk_prefix:
return _lookup_cjk(name[len(_cjk_prefix):])
if name[:len(_hangul_prefix)] == _hangul_prefix:
return _lookup_hangul(name[len(_hangul_prefix):])
if not base_mod:
- return trie_lookup(name)
+ code = trie_lookup(name)
else:
try:
- return _code_by_name[name]
+ code = _code_by_name[name]
except KeyError:
if name not in _code_by_name_corrected:
- return base_mod.trie_lookup(name)
+ code = base_mod.trie_lookup(name)
else:
raise
+ if not with_named_sequence and 0xF0200 <= code < 0xF0400:
+ raise KeyError
+ return code
def name(code):
if (0x3400 <= code <= 0x4DB5 or 0x4E00 <= code <= 0x9FA5 or 0x20000 <= code <= 0x2A6D6):
@@ -16817,6 +16820,8 @@
v_code = vl_code % len(_hangul_V)
return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
_hangul_V[v_code] + _hangul_T[t_code])
+ if 0xF0000 <= code < 0xF0400:
+ raise KeyError
if not base_mod:
return lookup_charcode(code)
@@ -21288,8 +21293,8 @@
]
-def lookup_with_alias(name):
- code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+ code = lookup(name, with_named_sequence=with_named_sequence)
if 0 <= code - 983040 < len(_name_aliases):
return _name_aliases[code - 983040]
else:
diff --git a/rpython/rlib/unicodedata/unicodedb_5_2_0.py b/rpython/rlib/unicodedata/unicodedb_5_2_0.py
--- a/rpython/rlib/unicodedata/unicodedb_5_2_0.py
+++ b/rpython/rlib/unicodedata/unicodedb_5_2_0.py
@@ -136803,22 +136803,25 @@
return code
raise KeyError
-def lookup(name):
+def lookup(name, with_named_sequence=False):
if name[:len(_cjk_prefix)] == _cjk_prefix:
return _lookup_cjk(name[len(_cjk_prefix):])
if name[:len(_hangul_prefix)] == _hangul_prefix:
return _lookup_hangul(name[len(_hangul_prefix):])
if not base_mod:
- return trie_lookup(name)
+ code = trie_lookup(name)
else:
try:
- return _code_by_name[name]
+ code = _code_by_name[name]
except KeyError:
if name not in _code_by_name_corrected:
- return base_mod.trie_lookup(name)
+ code = base_mod.trie_lookup(name)
else:
raise
+ if not with_named_sequence and 0xF0200 <= code < 0xF0400:
+ raise KeyError
+ return code
def name(code):
if (0x3400 <= code <= 0x4DB5 or 0x4E00 <= code <= 0x9FCB or 0x20000 <= code <= 0x2A6D6 or 0x2A700 <= code <= 0x2B734):
@@ -136832,6 +136835,8 @@
v_code = vl_code % len(_hangul_V)
return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
_hangul_V[v_code] + _hangul_T[t_code])
+ if 0xF0000 <= code < 0xF0400:
+ raise KeyError
if not base_mod:
return lookup_charcode(code)
@@ -157497,8 +157502,8 @@
]
-def lookup_with_alias(name):
- code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+ code = lookup(name, with_named_sequence=with_named_sequence)
if 0 <= code - 983040 < len(_name_aliases):
return _name_aliases[code - 983040]
else:
diff --git a/rpython/rlib/unicodedata/unicodedb_6_0_0.py b/rpython/rlib/unicodedata/unicodedb_6_0_0.py
--- a/rpython/rlib/unicodedata/unicodedb_6_0_0.py
+++ b/rpython/rlib/unicodedata/unicodedb_6_0_0.py
@@ -4520,22 +4520,25 @@
return code
raise KeyError
-def lookup(name):
+def lookup(name, with_named_sequence=False):
if name[:len(_cjk_prefix)] == _cjk_prefix:
return _lookup_cjk(name[len(_cjk_prefix):])
if name[:len(_hangul_prefix)] == _hangul_prefix:
return _lookup_hangul(name[len(_hangul_prefix):])
if not base_mod:
- return trie_lookup(name)
+ code = trie_lookup(name)
else:
try:
- return _code_by_name[name]
+ code = _code_by_name[name]
except KeyError:
if name not in _code_by_name_corrected:
- return base_mod.trie_lookup(name)
+ code = base_mod.trie_lookup(name)
else:
raise
+ if not with_named_sequence and 0xF0200 <= code < 0xF0400:
+ raise KeyError
+ return code
def name(code):
if (0x3400 <= code <= 0x4DB5 or 0x4E00 <= code <= 0x9FCB or 0x20000 <= code <= 0x2A6D6 or 0x2A700 <= code <= 0x2B734 or 0x2B740 <= code <= 0x2B81D):
@@ -4549,6 +4552,8 @@
v_code = vl_code % len(_hangul_V)
return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
_hangul_V[v_code] + _hangul_T[t_code])
+ if 0xF0000 <= code < 0xF0400:
+ raise KeyError
if not base_mod:
return lookup_charcode(code)
@@ -7240,8 +7245,8 @@
]
-def lookup_with_alias(name):
- code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+ code = lookup(name, with_named_sequence=with_named_sequence)
if 0 <= code - 983040 < len(_name_aliases):
return _name_aliases[code - 983040]
else:
diff --git a/rpython/rlib/unicodedata/unicodedb_6_2_0.py b/rpython/rlib/unicodedata/unicodedb_6_2_0.py
--- a/rpython/rlib/unicodedata/unicodedb_6_2_0.py
+++ b/rpython/rlib/unicodedata/unicodedb_6_2_0.py
@@ -6890,22 +6890,25 @@
return code
raise KeyError
-def lookup(name):
+def lookup(name, with_named_sequence=False):
if name[:len(_cjk_prefix)] == _cjk_prefix:
return _lookup_cjk(name[len(_cjk_prefix):])
if name[:len(_hangul_prefix)] == _hangul_prefix:
return _lookup_hangul(name[len(_hangul_prefix):])
if not base_mod:
- return trie_lookup(name)
+ code = trie_lookup(name)
else:
try:
- return _code_by_name[name]
+ code = _code_by_name[name]
except KeyError:
if name not in _code_by_name_corrected:
- return base_mod.trie_lookup(name)
+ code = base_mod.trie_lookup(name)
else:
raise
+ if not with_named_sequence and 0xF0200 <= code < 0xF0400:
+ raise KeyError
+ return code
def name(code):
if (0x3400 <= code <= 0x4DB5 or 0x4E00 <= code <= 0x9FCB or 0x20000 <= code <= 0x2A6D6 or 0x2A700 <= code <= 0x2B734 or 0x2B740 <= code <= 0x2B81D):
@@ -6919,6 +6922,8 @@
v_code = vl_code % len(_hangul_V)
return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
_hangul_V[v_code] + _hangul_T[t_code])
+ if 0xF0000 <= code < 0xF0400:
+ raise KeyError
if not base_mod:
return lookup_charcode(code)
@@ -10521,8 +10526,8 @@
]
-def lookup_with_alias(name):
- code = lookup(name)
+def lookup_with_alias(name, with_named_sequence=False):
+ code = lookup(name, with_named_sequence=with_named_sequence)
if 0 <= code - 983040 < len(_name_aliases):
return _name_aliases[code - 983040]
else:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit