On Thursday 09 August 2007 17:40:58 I wrote:
> My attached patch fix both bugs:
> - convert bytes to str8 in _compile() to be able to hash it
> - add a special version of escape() for bytes
My first attempt was buggy for this code snippet:
import re
assert type(re.sub(b'', b'', b'')) is bytes
assert type(re.sub(b'(x)', b'[\\1]', b'x')) is bytes
My first patch mixed bytes and str8, so re.sub failed in some cases.
So here is a new patch that uses str8 for the dictionary keys and str for regex
parsing (sre_parse.py), and then converts back to bytes for the 'literals' variable.
Victor Stinner
http://hachoir.org/
Index: Lib/re.py
===================================================================
--- Lib/re.py (révision 56838)
+++ Lib/re.py (copie de travail)
@@ -193,18 +193,34 @@
_alphanum[c] = 1
del c
+_alphanum_bytes = set(b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890')
+
def escape(pattern):
"Escape all non-alphanumeric characters in pattern."
- s = list(pattern)
- alphanum = _alphanum
- for i in range(len(pattern)):
- c = pattern[i]
- if c not in alphanum:
- if c == "\000":
- s[i] = "\\000"
+ if isinstance(pattern, bytes):
+ alphanum = _alphanum_bytes
+ s = b''
+ for c in pattern:
+ if c not in alphanum:
+ if not c:
+ s += b"\\000"
+ else:
+ s.append(92)
+ s.append(c)
else:
- s[i] = "\\" + c
- return pattern[:0].join(s)
+ s.append(c)
+ return s
+ else:
+ alphanum = _alphanum
+ s = list(pattern)
+ for i in range(len(pattern)):
+ c = pattern[i]
+ if c not in alphanum:
+ if c == "\000":
+ s[i] = "\\000"
+ else:
+ s[i] = "\\" + c
+ return ''.join(s)
# --------------------------------------------------------------------
# internals
@@ -218,7 +234,10 @@
def _compile(*key):
# internal: compile pattern
- cachekey = (type(key[0]),) + key
+ if isinstance(key[0], bytes):
+ cachekey = (type(key[0]), str8(key[0]), key[1])
+ else:
+ cachekey = (type(key[0]),) + key
p = _cache.get(cachekey)
if p is not None:
return p
@@ -236,12 +255,20 @@
_cache[cachekey] = p
return p
-def _compile_repl(*key):
+def _compile_repl(repl, pattern):
# internal: compile replacement pattern
+ if isinstance(repl, bytes):
+ cacherepl = str8(repl)
+ else:
+ cacherepl = repl
+ if isinstance(pattern, bytes):
+ cachepattern = str8(pattern)
+ else:
+ cachepattern = pattern
+ key = (cacherepl, cachepattern)
p = _cache_repl.get(key)
if p is not None:
return p
- repl, pattern = key
try:
p = sre_parse.parse_template(repl, pattern)
except error as v:
Index: Lib/test/test_re.py
===================================================================
--- Lib/test/test_re.py (révision 56838)
+++ Lib/test/test_re.py (copie de travail)
@@ -397,18 +397,32 @@
self.assertEqual(re.search("\s(b)", " b").group(1), "b")
self.assertEqual(re.search("a\s", "a ").group(0), "a ")
- def test_re_escape(self):
- p=""
- for i in range(0, 256):
- p = p + chr(i)
- self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
- True)
- self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
+ def _test_re_escape(self, use_bytes):
+ if use_bytes:
+ p=bytes()
+ for i in range(0, 256):
+ p.append(i)
+ self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
+ True)
+ self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
+ else:
+ p=""
+ for i in range(0, 256):
+ p = p + chr(i)
+ self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
+ True)
+ self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
pat=re.compile(re.escape(p))
self.assertEqual(pat.match(p) is not None, True)
self.assertEqual(pat.match(p).span(), (0,256))
+ def test_re_escape_str(self):
+ self._test_re_escape(False)
+
+ def test_re_escape_bytes(self):
+ self._test_re_escape(True)
+
def test_pickling(self):
import pickle
self.pickle_test(pickle)
Index: Lib/sre_compile.py
===================================================================
--- Lib/sre_compile.py (révision 56838)
+++ Lib/sre_compile.py (copie de travail)
@@ -472,7 +472,7 @@
code[skip] = len(code) - skip
def isstring(obj):
- return isinstance(obj, basestring)
+ return isinstance(obj, basestring) or isinstance(obj, bytes)
def _code(p, flags):
Index: Lib/sre_parse.py
===================================================================
--- Lib/sre_parse.py (révision 56838)
+++ Lib/sre_parse.py (copie de travail)
@@ -184,6 +184,9 @@
class Tokenizer:
def __init__(self, string):
+ self.use_bytes = isinstance(string, bytes)
+ if self.use_bytes:
+ string = str(string, "latin-1")
self.string = string
self.index = 0
self.__next()
@@ -701,6 +704,7 @@
# group references
s = Tokenizer(source)
sget = s.get
+ use_bytes = s.use_bytes
p = []
a = p.append
def literal(literal, p=p, pappend=a):
@@ -779,7 +783,10 @@
groupsappend((i, s))
# literal[i] is already None
else:
- literals[i] = s
+ if use_bytes:
+ literals[i] = bytes(s)
+ else:
+ literals[i] = s
i = i + 1
return groups, literals
_______________________________________________
Python-3000 mailing list
[email protected]
http://mail.python.org/mailman/listinfo/python-3000
Unsubscribe:
http://mail.python.org/mailman/options/python-3000/archive%40mail-archive.com