Serhiy Storchaka added the comment:
Good catch Antoine!
Here is a sample of a more complicated implementation.
----------
title: Add a convert_surrogates function to "clean" surrogate escaped strings
-> Add codecs.convert_surrogateescape to "clean" surrogate escaped strings
Added file: http://bugs.python.org/file36700/convert_surrogates.py
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue18814>
_______________________________________
import codecs
import re
def convert_surrogates(data: str, errors: str = 'strict') -> str:
    """Pass every run of lone surrogates in *data* through an error handler.

    Each maximal run of code points in the surrogate range U+D800-U+DFFF
    is reported to the codec error handler registered under *errors* as a
    ``UnicodeTranslateError``.  The handler's replacement string is spliced
    into the result and scanning resumes at the position it returns.

    Args:
        data: The string to scan.
        errors: Name of a registered codec error handler; it is resolved
            with ``codecs.lookup_error`` only once a surrogate is found.

    Returns:
        *data* itself when it contains no surrogates, otherwise a new
        string with each surrogate run replaced by the handler's output.

    Raises:
        LookupError: From ``codecs.lookup_error`` when *errors* is not a
            registered handler name and *data* contains a surrogate.
        Exception: Whatever the chosen handler raises for the
            ``UnicodeTranslateError`` it is given.
    """
    # The surrogate block ends at U+DFFF; a wider class would also catch
    # valid private-use characters (U+E000 and up).
    pattern = re.compile('[\ud800-\udfff]+')
    handler = None
    pos = 0
    res = []
    while True:
        m = pattern.search(data, pos)
        if m is None:
            break
        if handler is None:
            # Lazy lookup: clean input never touches the handler registry.
            handler = codecs.lookup_error(errors)
        res.append(data[pos:m.start()])
        # The handler returns the replacement text and the index in *data*
        # at which to resume scanning.
        repl, pos = handler(
            UnicodeTranslateError(data, m.start(), m.end(), 'lone surrogates'))
        res.append(repl)
    if not res:
        # No surrogate was found: hand back the original object unchanged.
        return data
    res.append(data[pos:])
    return ''.join(res)
def convert_surrogateescape(data: str, errors: str = 'strict') -> str:
    """Re-handle ``surrogateescape``-escaped bytes embedded in *data*.

    Each maximal run of lone surrogates is encoded back to bytes with
    ``encode('ascii', 'surrogateescape')``.  Those bytes are then reported
    to the codec error handler registered under *errors* as a
    ``UnicodeDecodeError``, and the handler's replacement string is
    spliced into the result.

    Args:
        data: The string to scan.
        errors: Name of a registered codec error handler; it is resolved
            with ``codecs.lookup_error`` only once a surrogate is found.

    Returns:
        *data* itself when it contains no surrogates, otherwise a new
        string with each escaped run replaced by the handler's output.

    Raises:
        UnicodeTranslateError: If a surrogate run cannot be encoded with
            ``surrogateescape`` (a surrogate outside U+DC80-U+DCFF), or
            if the handler raises ``UnicodeDecodeError``.  Positions are
            reported relative to *data*.
        LookupError: From ``codecs.lookup_error`` when *errors* is not a
            registered handler name and *data* contains a surrogate.
    """
    # The surrogate block ends at U+DFFF; a wider class would also catch
    # valid private-use characters (U+E000 and up).
    pattern = re.compile('[\ud800-\udfff]+')
    handler = None
    pos = 0
    res = []
    while True:
        m = pattern.search(data, pos)
        if m is None:
            break
        if handler is None:
            # Lazy lookup: clean input never touches the handler registry.
            handler = codecs.lookup_error(errors)
        start = m.start()
        res.append(data[pos:start])
        try:
            # Recover the raw bytes the escaped surrogates stand for.
            baddata = data[start:m.end()].encode('ascii', 'surrogateescape')
        except UnicodeEncodeError as err:
            # err positions are relative to the slice; shift them so they
            # index into *data*.
            raise UnicodeTranslateError(
                data, err.start + start, err.end + start,
                r'surrogates not in range \udc80-\udcff') from None
        try:
            repl, pos = handler(UnicodeDecodeError(
                'unicode', baddata, 0, len(baddata), 'lone surrogates'))
        except UnicodeDecodeError as err:
            raise UnicodeTranslateError(
                data, err.start + start, err.end + start,
                err.reason) from None
        # The handler's resume position indexes into baddata (one byte per
        # surrogate), so shift it back into *data* coordinates.
        pos += start
        res.append(repl)
    if not res:
        # No surrogate was found: hand back the original object unchanged.
        return data
    res.append(data[pos:])
    return ''.join(res)
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
https://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com