Re: [Python-3000] bytes regular expression?

Victor Stinner Thu, 09 Aug 2007 09:48:39 -0700

On Thursday 09 August 2007 17:40:58 I wrote:
> My attached patch fix both bugs:
>  - convert bytes to str8 in _compile() to be able to hash it
>  - add a special version of escape() for bytes


My first attempt was buggy for this code snippet:
   import re
   assert type(re.sub(b'', b'', b'')) is bytes
   assert type(re.sub(b'(x)', b'[\\1]', b'x')) is bytes

My first patch mixed bytes and str8, so re.sub failed in some cases.

So here is a new patch that uses str8 for the dictionary (cache) keys and str
for regex parsing (sre_parse.py), then converts back to bytes for the 'literals' variable.

Victor Stinner
http://hachoir.org/

Index: Lib/re.py
===================================================================
--- Lib/re.py	(révision 56838)
+++ Lib/re.py	(copie de travail)
@@ -193,18 +193,34 @@
     _alphanum[c] = 1
 del c
 
+_alphanum_bytes = set(b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890')
+
 def escape(pattern):
     "Escape all non-alphanumeric characters in pattern."
-    s = list(pattern)
-    alphanum = _alphanum
-    for i in range(len(pattern)):
-        c = pattern[i]
-        if c not in alphanum:
-            if c == "\000":
-                s[i] = "\\000"
+    if isinstance(pattern, bytes):
+        alphanum = _alphanum_bytes
+        s = b''
+        for c in pattern:
+            if c not in alphanum:
+                if not c:
+                    s += b"\\000"
+                else:
+                    s.append(92)
+                    s.append(c)
             else:
-                s[i] = "\\" + c
-    return pattern[:0].join(s)
+                s.append(c)
+        return s
+    else:
+        alphanum = _alphanum
+        s = list(pattern)
+        for i in range(len(pattern)):
+            c = pattern[i]
+            if c not in alphanum:
+                if c == "\000":
+                    s[i] = "\\000"
+                else:
+                    s[i] = "\\" + c
+        return ''.join(s)
 
 # --------------------------------------------------------------------
 # internals
@@ -218,7 +234,10 @@
 
 def _compile(*key):
     # internal: compile pattern
-    cachekey = (type(key[0]),) + key
+    if isinstance(key[0], bytes):
+        cachekey = (type(key[0]), str8(key[0]), key[1])
+    else:
+        cachekey = (type(key[0]),) + key
     p = _cache.get(cachekey)
     if p is not None:
         return p
@@ -236,12 +255,20 @@
     _cache[cachekey] = p
     return p
 
-def _compile_repl(*key):
+def _compile_repl(repl, pattern):
     # internal: compile replacement pattern
+    if isinstance(repl, bytes):
+        cacherepl = str8(repl)
+    else:
+        cacherepl = repl
+    if isinstance(pattern, bytes):
+        cachepattern = str8(pattern)
+    else:
+        cachepattern = pattern
+    key = (cacherepl, cachepattern)
     p = _cache_repl.get(key)
     if p is not None:
         return p
-    repl, pattern = key
     try:
         p = sre_parse.parse_template(repl, pattern)
     except error as v:
Index: Lib/test/test_re.py
===================================================================
--- Lib/test/test_re.py	(révision 56838)
+++ Lib/test/test_re.py	(copie de travail)
@@ -397,18 +397,32 @@
         self.assertEqual(re.search("\s(b)", " b").group(1), "b")
         self.assertEqual(re.search("a\s", "a ").group(0), "a ")
 
-    def test_re_escape(self):
-        p=""
-        for i in range(0, 256):
-            p = p + chr(i)
-            self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
-                             True)
-            self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
+    def _test_re_escape(self, use_bytes):
+        if use_bytes:
+            p=bytes()
+            for i in range(0, 256):
+                p.append(i)
+                self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
+                                 True)
+                self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
+        else:
+            p=""
+            for i in range(0, 256):
+                p = p + chr(i)
+                self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None,
+                                 True)
+                self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1))
 
         pat=re.compile(re.escape(p))
         self.assertEqual(pat.match(p) is not None, True)
         self.assertEqual(pat.match(p).span(), (0,256))
 
+    def test_re_escape_str(self):
+        self._test_re_escape(False)
+
+    def test_re_escape_bytes(self):
+        self._test_re_escape(True)
+
     def test_pickling(self):
         import pickle
         self.pickle_test(pickle)
Index: Lib/sre_compile.py
===================================================================
--- Lib/sre_compile.py	(révision 56838)
+++ Lib/sre_compile.py	(copie de travail)
@@ -472,7 +472,7 @@
     code[skip] = len(code) - skip
 
 def isstring(obj):
-    return isinstance(obj, basestring)
+    return isinstance(obj, basestring) or isinstance(obj, bytes)
 
 def _code(p, flags):
 
Index: Lib/sre_parse.py
===================================================================
--- Lib/sre_parse.py	(révision 56838)
+++ Lib/sre_parse.py	(copie de travail)
@@ -184,6 +184,9 @@
 
 class Tokenizer:
     def __init__(self, string):
+        self.use_bytes = isinstance(string, bytes)
+        if self.use_bytes:
+            string = str(string, "latin-1")
         self.string = string
         self.index = 0
         self.__next()
@@ -701,6 +704,7 @@
     # group references
     s = Tokenizer(source)
     sget = s.get
+    use_bytes = s.use_bytes
     p = []
     a = p.append
     def literal(literal, p=p, pappend=a):
@@ -779,7 +783,10 @@
             groupsappend((i, s))
             # literal[i] is already None
         else:
-            literals[i] = s
+            if use_bytes:
+                literals[i] = bytes(s)
+            else:
+                literals[i] = s
         i = i + 1
     return groups, literals

_______________________________________________
Python-3000 mailing list
[email protected]
http://mail.python.org/mailman/listinfo/python-3000
Unsubscribe: 
http://mail.python.org/mailman/options/python-3000/archive%40mail-archive.com

Re: [Python-3000] bytes regular expression?

Reply via email to