[pypy-commit] pypy unicode-utf8: remove the flag

fijal Thu, 07 Dec 2017 07:45:50 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r93297:db2a8c9fccf1
Date: 2017-12-07 17:44 +0200
http://bitbucket.org/pypy/pypy/changeset/db2a8c9fccf1/


Log:    remove the flag

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1087,8 +1087,11 @@
     def newlist_utf8(self, list_u, is_ascii):
         l_w = [None] * len(list_u)
         for i, item in enumerate(list_u):
-            length, flag = rutf8.check_utf8(item, True)
-            l_w[i] = self.newutf8(item, length, flag)
+            if not is_ascii:
+                length = rutf8.check_utf8(item, True)
+            else:
+                length = len(item)
+            l_w[i] = self.newutf8(item, length)
         return self.newlist(l_w)
 
     def newlist_int(self, list_i):
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -64,8 +64,8 @@
             r = unicodehelper.decode_raw_unicode_escape(space, substr)
         else:
             r = unicodehelper.decode_unicode_escape(space, substr)
-        v, length, flag = r
-        return space.newutf8(v, length, flag)
+        v, length = r
+        return space.newutf8(v, length)
 
     need_encoding = (encoding is not None and
                      encoding != "utf-8" and encoding != "utf8" and
@@ -74,8 +74,8 @@
     substr = s[ps : q]
     if rawmode or '\\' not in s[ps:]:
         if need_encoding:
-            lgt, flag = unicodehelper.check_utf8_or_raise(space, substr)
-            w_u = space.newutf8(substr, lgt, flag)
+            lgt = unicodehelper.check_utf8_or_raise(space, substr)
+            w_u = space.newutf8(substr, lgt)
             w_v = unicodehelper.encode(space, w_u, encoding)
             return w_v
         else:
@@ -234,8 +234,8 @@
     p = ps
     while p < end and ord(s[p]) & 0x80:
         p += 1
-    lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, p)
-    w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt, flag),
+    lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p)
+    w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt),
                                recode_encoding)
     v = space.bytes_w(w_v)
     return v, p
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -10,13 +10,13 @@
     return str_decode_utf8(u, True, "strict", None)
 
 def test_decode_utf8():
-    assert decode_utf8("abc") == ("abc", 3, 3, rutf8.FLAG_ASCII)
-    assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1, rutf8.FLAG_REGULAR)
-    assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES)
-    assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES)
+    assert decode_utf8("abc") == ("abc", 3, 3)
+    assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1)
+    assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1)
+    assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1)
     assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == (
-        "\xed\xa0\x80\xed\xb0\x80", 6, 2, rutf8.FLAG_HAS_SURROGATES)
-    assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1, rutf8.FLAG_REGULAR)
+        "\xed\xa0\x80\xed\xb0\x80", 6, 2)
+    assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1)
 
 def test_utf8_encode_ascii():
     assert utf8_encode_ascii("abc", "??", "??") == "abc"
@@ -41,19 +41,19 @@
     assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
 
 def test_str_decode_ascii():
-    assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3, rutf8.FLAG_ASCII)
+    assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3)
     def eh(errors, encoding, reason, p, start, end):
         lst.append((errors, encoding, p, start, end))
         return u"\u1234\u5678".encode("utf8"), end
     lst = []
     input = "\xe8"
     exp = u"\u1234\u5678".encode("utf8")
-    assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2, rutf8.FLAG_REGULAR)
+    assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2)
     assert lst == [("??", "ascii", input, 0, 1)]
     lst = []
     input = "\xe8\xe9abc\xea\xeb"
     assert str_decode_ascii(input, "??", True, eh) == (
-        exp + exp + "abc" + exp + exp, 7, 11, rutf8.FLAG_REGULAR)
+        exp + exp + "abc" + exp + exp, 7, 11)
     assert lst == [("??", "ascii", input, 0, 1),
                    ("??", "ascii", input, 1, 2),
                    ("??", "ascii", input, 5, 6),
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -3,7 +3,6 @@
 from pypy.interpreter.error import OperationError
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib import rutf8
-from rpython.rlib.rutf8 import combine_flags
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rstring import StringBuilder
 from pypy.module._codecs import interp_codecs
@@ -26,10 +25,10 @@
     # Fast version of the "strict" errors handler.
     def raise_unicode_exception_encode(errors, encoding, msg, utf8,
                                        startingpos, endingpos):
-        u_len, flag = rutf8.check_utf8(utf8, True)
+        u_len = rutf8.check_utf8(utf8, True)
         raise OperationError(space.w_UnicodeEncodeError,
                              space.newtuple([space.newtext(encoding),
-                                             space.newutf8(utf8, u_len, flag),
+                                             space.newutf8(utf8, u_len),
                                              space.newint(startingpos),
                                              space.newint(endingpos),
                                              space.newtext(msg)]))
@@ -55,18 +54,18 @@
 def decode_unicode_escape(space, string):
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
-    result_utf8, consumed, length, flag = str_decode_unicode_escape(
+    result_utf8, consumed, length = str_decode_unicode_escape(
         string, "strict",
         final=True,
         errorhandler=decode_error_handler(space),
         ud_handler=unicodedata_handler)
-    return result_utf8, length, flag
+    return result_utf8, length
 
 def decode_raw_unicode_escape(space, string):
-    result_utf8, consumed, lgt, flag = str_decode_raw_unicode_escape(
+    result_utf8, consumed, lgt = str_decode_raw_unicode_escape(
         string, "strict",
         final=True, errorhandler=decode_error_handler(space))
-    return result_utf8, lgt, flag
+    return result_utf8, lgt
 
 def check_ascii_or_raise(space, string):
     try:
@@ -83,19 +82,19 @@
     # you still get two surrogate unicode characters in the result.
     # These are the Python2 rules; Python3 differs.
     try:
-        length, flag = rutf8.check_utf8(string, True, start, end)
+        length = rutf8.check_utf8(string, True, start, end)
     except rutf8.CheckError as e:
         # convert position into unicode position
-        lgt, flags = rutf8.check_utf8(string, True, start, stop=e.pos)
+        lgt = rutf8.check_utf8(string, True, start, stop=e.pos)
         decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string,
                                     start + lgt, start + lgt + 1)
         assert False, "unreachable"
-    return length, flag
+    return length
 
 def str_decode_ascii(s, errors, final, errorhandler):
     try:
         rutf8.check_ascii(s)
-        return s, len(s), len(s), rutf8.FLAG_ASCII
+        return s, len(s), len(s)
     except rutf8.CheckError:
         return _str_decode_ascii_slowpath(s, errors, final, errorhandler)
 
@@ -112,13 +111,13 @@
             res.append(ch)
             i += 1
     ress = res.build()
-    lgt, flag = rutf8.check_utf8(ress, True)
-    return ress, len(s), lgt, flag
+    lgt = rutf8.check_utf8(ress, True)
+    return ress, len(s), lgt
 
 def str_decode_latin_1(s, errors, final, errorhandler):
     try:
         rutf8.check_ascii(s)
-        return s, len(s), len(s), rutf8.FLAG_ASCII
+        return s, len(s), len(s)
     except rutf8.CheckError:
         return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
 
@@ -138,7 +137,7 @@
             res.append_slice(s, start, end)
             i = end
     # cannot be ASCII, cannot have surrogates, I believe
-    return res.build(), len(s), len(s), rutf8.FLAG_REGULAR
+    return res.build(), len(s), len(s)
 
 def utf8_encode_latin_1(s, errors, errorhandler):
     try:
@@ -336,8 +335,7 @@
         res.append(r)
 
     r = res.build()
-    lgt, flag = rutf8.check_utf8(r, True)
-    return r, pos, lgt, flag
+    return r, pos, rutf8.check_utf8(r, True)
 
 hexdigits = "0123456789ABCDEFabcdef"
 
@@ -350,7 +348,7 @@
             endinpos += 1
         res, pos = errorhandler(errors, encoding,
                                  message, s, pos-2, endinpos)
-        size, flag = rutf8.check_utf8(res, True)
+        size = rutf8.check_utf8(res, True)
         builder.append(res)
     else:
         try:
@@ -361,7 +359,7 @@
                 endinpos += 1
             res, pos = errorhandler(errors, encoding,
                                     message, s, pos-2, endinpos)
-            size, flag = rutf8.check_utf8(res, True)
+            size = rutf8.check_utf8(res, True)
             builder.append(res)
         else:
             # when we get here, chr is a 32-bit unicode character
@@ -371,21 +369,19 @@
                 message = "illegal Unicode character"
                 res, pos = errorhandler(errors, encoding,
                                         message, s, pos-2, pos+digits)
-                size, flag = rutf8.check_utf8(res, True)
+                size = rutf8.check_utf8(res, True)
                 builder.append(res)
             else:
-                flag = rutf8.get_flag_from_code(intmask(chr))
                 pos += digits
                 size = 1
 
-    return pos, size, flag
+    return pos, size
 
 def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
     size = len(s)
     if size == 0:
-        return '', 0, 0, rutf8.FLAG_ASCII
+        return '', 0, 0
 
-    flag = rutf8.FLAG_ASCII
     builder = StringBuilder(size)
     pos = 0
     outsize = 0
@@ -396,7 +392,6 @@
         if ch != '\\':
             if ord(ch) > 0x7F:
                 rutf8.unichr_as_utf8_append(builder, ord(ch))
-                flag = combine_flags(rutf8.FLAG_REGULAR, flag)
             else:
                 builder.append(ch)
             pos += 1
@@ -409,9 +404,8 @@
             message = "\\ at end of string"
             res, pos = errorhandler(errors, "unicodeescape",
                                     message, s, pos-1, size)
-            newsize, newflag = rutf8.check_utf8(res, True)
+            newsize = rutf8.check_utf8(res, True)
             outsize + newsize
-            flag = combine_flags(flag, newflag)
             builder.append(res)
             continue
 
@@ -464,7 +458,6 @@
             outsize += 1
             if x > 0x7F:
                 rutf8.unichr_as_utf8_append(builder, x)
-                flag = combine_flags(rutf8.FLAG_REGULAR, flag)
             else:
                 builder.append(chr(x))
         # hex escapes
@@ -472,27 +465,24 @@
         elif ch == 'x':
             digits = 2
             message = "truncated \\xXX escape"
-            pos, newsize, newflag = hexescape(builder, s, pos, digits,
+            pos, newsize = hexescape(builder, s, pos, digits,
                             "unicodeescape", errorhandler, message, errors)
-            flag = combine_flags(flag, newflag)
             outsize += newsize
 
         # \uXXXX
         elif ch == 'u':
             digits = 4
             message = "truncated \\uXXXX escape"
-            pos, newsize, newflag = hexescape(builder, s, pos, digits,
+            pos, newsize = hexescape(builder, s, pos, digits,
                             "unicodeescape", errorhandler, message, errors)
-            flag = combine_flags(flag, newflag)
             outsize += newsize
 
         #  \UXXXXXXXX
         elif ch == 'U':
             digits = 8
             message = "truncated \\UXXXXXXXX escape"
-            pos, newsize, newflag = hexescape(builder, s, pos, digits,
+            pos, newsize = hexescape(builder, s, pos, digits,
                             "unicodeescape", errorhandler, message, errors)
-            flag = combine_flags(flag, newflag)
             outsize += newsize
 
         # \N{name}
@@ -512,29 +502,25 @@
                     if code < 0:
                         res, pos = errorhandler(errors, "unicodeescape",
                                                 message, s, pos-1, look+1)
-                        newsize, newflag = rutf8.check_utf8(res, True)
-                        flag = combine_flags(flag, newflag)
+                        newsize = rutf8.check_utf8(res, True)
                         outsize += newsize
                         builder.append(res)
                         continue
                     pos = look + 1
                     outsize += 1
-                    flag = combine_flags(flag, rutf8.get_flag_from_code(code))
                     rutf8.unichr_as_utf8_append(builder, code,
                                                 allow_surrogates=True)
                     # xxx 'code' is probably always within range here...
                 else:
                     res, pos = errorhandler(errors, "unicodeescape",
                                             message, s, pos-1, look+1)
-                    newsize, newflag = rutf8.check_utf8(res, True)
-                    flag = combine_flags(flag, newflag)
+                    newsize = rutf8.check_utf8(res, True)
                     outsize += newsize
                     builder.append(res)
             else:
                 res, pos = errorhandler(errors, "unicodeescape",
                                         message, s, pos-1, look+1)
-                newsize, newflag = rutf8.check_utf8(res, True)
-                flag = combine_flags(flag, newflag)
+                newsize = rutf8.check_utf8(res, True)
                 outsize += newsize
                 builder.append(res)
         else:
@@ -542,7 +528,7 @@
             builder.append(ch)
             outsize += 2
 
-    return builder.build(), pos, outsize, flag
+    return builder.build(), pos, outsize
 
 # ____________________________________________________________
 # Raw unicode escape
@@ -551,7 +537,7 @@
                                   errorhandler=None):
     size = len(s)
     if size == 0:
-        return '', 0, 0, rutf8.FLAG_ASCII
+        return '', 0, 0
 
     result = StringBuilder(size)
     pos = 0
@@ -593,8 +579,8 @@
                         "rawunicodeescape", errorhandler, message, errors)
 
     r = result.build()
-    lgt, flag = rutf8.check_utf8(r, True)
-    return r, pos, lgt, flag
+    lgt = rutf8.check_utf8(r, True)
+    return r, pos, lgt
 
 _utf8_encode_unicode_escape = rutf8.make_utf8_escape_function()
 
@@ -729,7 +715,7 @@
                      errorhandler=None):
     size = len(s)
     if size == 0:
-        return '', 0, 0, rutf8.FLAG_ASCII
+        return '', 0, 0
 
     inShift = False
     base64bits = 0
@@ -740,7 +726,6 @@
     result = StringBuilder(size)
     pos = 0
     shiftOutStartPos = 0
-    flag = rutf8.FLAG_ASCII
     startinpos = 0
     while pos < size:
         ch = s[pos]
@@ -766,13 +751,11 @@
                                         (outCh & 0x3FF)) + 0x10000
                             rutf8.unichr_as_utf8_append(result, code)
                             outsize += 1
-                            flag = combine_flags(flag, rutf8.FLAG_REGULAR)
                             surrogate = 0
                             continue
                         else:
                             rutf8.unichr_as_utf8_append(result, surrogate,
                                                         allow_surrogates=True)
-                            flag = rutf8.FLAG_HAS_SURROGATES
                             outsize += 1
                             surrogate = 0
                             # Not done with outCh: falls back to next line
@@ -780,8 +763,6 @@
                         # first surrogate
                         surrogate = outCh
                     else:
-                        flag = combine_flags(flag,
-                                             rutf8.get_flag_from_code(outCh))
                         outsize += 1
                         assert outCh >= 0
                         rutf8.unichr_as_utf8_append(result, outCh, True)
@@ -797,9 +778,8 @@
                         msg = "partial character in shift sequence"
                         res, pos = errorhandler(errors, 'utf7',
                                                 msg, s, pos-1, pos)
-                        reslen, resflags = rutf8.check_utf8(res, True)
+                        reslen = rutf8.check_utf8(res, True)
                         outsize += reslen
-                        flag = combine_flags(flag, resflags)
                         result.append(res)
                         continue
                     else:
@@ -809,15 +789,13 @@
                             msg = "non-zero padding bits in shift sequence"
                             res, pos = errorhandler(errors, 'utf7',
                                                     msg, s, pos-1, pos)
-                            reslen, resflags = rutf8.check_utf8(res, True)
+                            reslen = rutf8.check_utf8(res, True)
                             outsize += reslen
-                            flag = combine_flags(flag, resflags)
                             result.append(res)
                             continue
 
                 if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
                     outsize += 1
-                    flag = rutf8.FLAG_HAS_SURROGATES
                     rutf8.unichr_as_utf8_append(result, surrogate, True)
                 surrogate = 0
 
@@ -849,9 +827,8 @@
             pos += 1
             msg = "unexpected special character"
             res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
-            reslen, resflags = rutf8.check_utf8(res, True)
+            reslen = rutf8.check_utf8(res, True)
             outsize += reslen
-            flag = combine_flags(flag, resflags)
             result.append(res)
 
     # end of string
@@ -864,9 +841,8 @@
             (base64bits > 0 and base64buffer != 0)):
             msg = "unterminated shift sequence"
             res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos)
-            reslen, resflags = rutf8.check_utf8(res, True)
+            reslen = rutf8.check_utf8(res, True)
             outsize += reslen
-            flag = combine_flags(flag, resflags)
             result.append(res)
             final_length = result.getlength()
     elif inShift:
@@ -874,7 +850,7 @@
         final_length = shiftOutStartPos # back off output
 
     assert final_length >= 0
-    return result.build()[:final_length], pos, outsize, flag
+    return result.build()[:final_length], pos, outsize
 
 def utf8_encode_utf_7(s, errors, errorhandler):
     size = len(s)
@@ -937,21 +913,21 @@
 
 def str_decode_utf_16(s, errors, final=True,
                       errorhandler=None):
-    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
                                                         errorhandler, "native")
-    return result, c, lgt, flag
+    return result, c, lgt
 
 def str_decode_utf_16_be(s, errors, final=True,
                         errorhandler=None):
-    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
                                                          errorhandler, "big")
-    return result, c, lgt, flag
+    return result, c, lgt
 
 def str_decode_utf_16_le(s, errors, final=True,
                          errorhandler=None):
-    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
                                                         errorhandler, "little")
-    return result, c, lgt, flag
+    return result, c, lgt
 
 def str_decode_utf_16_helper(s, errors, final=True,
                              errorhandler=None,
@@ -994,7 +970,7 @@
     else:
         bo = 1
     if size == 0:
-        return '', 0, 0, rutf8.FLAG_ASCII, bo
+        return '', 0, 0, bo
     if bo == -1:
         # force little endian
         ihi = 1
@@ -1053,8 +1029,8 @@
                                   s, pos - 2, pos)
             result.append(r)
     r = result.build()
-    lgt, flag = rutf8.check_utf8(r, True)
-    return result.build(), pos, lgt, flag, bo
+    lgt = rutf8.check_utf8(r, True)
+    return result.build(), pos, lgt, bo
 
 def _STORECHAR(result, CH, byteorder):
     hi = chr(((CH) >> 8) & 0xff)
@@ -1143,21 +1119,21 @@
 
 def str_decode_utf_32(s, errors, final=True,
                       errorhandler=None):
-    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+    result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
                                                         errorhandler, "native")
-    return result, c, lgt, flag
+    return result, c, lgt
 
 def str_decode_utf_32_be(s, errors, final=True,
                          errorhandler=None):
-    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+    result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
                                                          errorhandler, "big")
-    return result, c, lgt, flag
+    return result, c, lgt
 
 def str_decode_utf_32_le(s, errors, final=True,
                          errorhandler=None):
-    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+    result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
                                                         errorhandler, "little")
-    return result, c, lgt, flag
+    return result, c, lgt
 
 BOM32_DIRECT  = intmask(0x0000FEFF)
 BOM32_REVERSE = intmask(0xFFFE0000)
@@ -1203,7 +1179,7 @@
     else:
         bo = 1
     if size == 0:
-        return '', 0, 0, rutf8.FLAG_ASCII, bo
+        return '', 0, 0, bo
     if bo == -1:
         # force little endian
         iorder = [0, 1, 2, 3]
@@ -1238,8 +1214,8 @@
         rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=True)
         pos += 4
     r = result.build()
-    lgt, flag = rutf8.check_utf8(r, True)
-    return r, pos, lgt, flag, bo
+    lgt = rutf8.check_utf8(r, True)
+    return r, pos, lgt, bo
 
 def _STORECHAR32(result, CH, byteorder):
     c0 = chr(((CH) >> 24) & 0xff)
@@ -1325,7 +1301,7 @@
                                 errorhandler=None):
     size = len(s)
     if size == 0:
-        return '', 0, 0, rutf8.FLAG_ASCII
+        return '', 0, 0
 
     unicode_bytes = 4
     if BYTEORDER == "little":
@@ -1362,8 +1338,8 @@
         rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True)
         pos += unicode_bytes
     r = result.build()
-    lgt, flag = rutf8.check_utf8(r, True)
-    return r, pos, lgt, flag
+    lgt = rutf8.check_utf8(r, True)
+    return r, pos, lgt
 
 def utf8_encode_unicode_internal(s, errors, errorhandler):
     size = len(s)
@@ -1404,7 +1380,7 @@
                                   errorhandler=errorhandler)
     size = len(s)
     if size == 0:
-        return '', 0, 0, rutf8.FLAG_ASCII
+        return '', 0, 0
 
     pos = 0
     result = StringBuilder(size)
@@ -1421,8 +1397,8 @@
         result.append(c)
         pos += 1
     r = result.build()
-    lgt, flag = rutf8.check_utf8(r, True)
-    return r, pos, lgt, flag
+    lgt = rutf8.check_utf8(r, True)
+    return r, pos, lgt
 
 def utf8_encode_charmap(s, errors, errorhandler=None,
                            mapping=None):
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -26,14 +26,8 @@
     "Return a Unicode string of one character with the given ordinal."
     if code < 0 or code > 0x10FFFF:
         raise oefmt(space.w_ValueError, "unichr() arg out of range")        
-    elif code < 0x80:
-        flag = rutf8.FLAG_ASCII
-    elif 0xD800 <= code <= 0xDFFF:
-        flag = rutf8.FLAG_HAS_SURROGATES
-    else:
-        flag = rutf8.FLAG_REGULAR
     s = rutf8.unichr_as_utf8(code, allow_surrogates=True)
-    return space.newutf8(s, 1, flag)
+    return space.newutf8(s, 1)
 
 def len(space, w_obj):
     "len(object) -> integer\n\nReturn the number of items of a sequence or mapping."
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -183,8 +183,7 @@
             raise oefmt(self.space.w_ValueError,
                         "%s out of range for conversion to unicode: %s",
                         self.name, s)
-        flag = rutf8.get_flag_from_code(intmask(value))
-        return self.space.newutf8(utf8, 1, flag)
+        return self.space.newutf8(utf8, 1)
 
     def string(self, cdataobj, maxlen):
         with cdataobj as ptr:
@@ -215,15 +214,15 @@
 
     def unpack_ptr(self, w_ctypeptr, ptr, length):
         if self.size == 2:
-            utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length)
+            utf8, lgt = wchar_helper.utf8_from_char16(ptr, length)
         else:
             try:
-                utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length)
+                utf8, lgt = wchar_helper.utf8_from_char32(ptr, length)
             except wchar_helper.OutOfRange as e:
                 raise oefmt(self.space.w_ValueError,
                             "%s out of range for conversion to unicode: %s",
                             self.name, hex(e.ordinal))
-        return self.space.newutf8(utf8, lgt, flag)
+        return self.space.newutf8(utf8, lgt)
 
 
 class W_CTypePrimitiveSigned(W_CTypePrimitive):
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -19,16 +19,14 @@
     ptr = rffi.cast(rffi.UINTP, ptr)
     u = StringBuilder(length)
     j = 0
-    flag = rutf8.FLAG_ASCII
     while j < length:
         ch = intmask(ptr[j])
         j += 1
-        flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch))
         try:
             rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True)
         except ValueError:
             raise OutOfRange(ch)
-    return u.build(), length, flag
+    return u.build(), length
 
 def utf8_from_char16(ptr, length):
     # 'ptr' is a pointer to 'length' 16-bit integers
@@ -36,7 +34,6 @@
     u = StringBuilder(length)
     j = 0
     result_length = length
-    flag = rutf8.FLAG_ASCII
     while j < length:
         ch = intmask(ptr[j])
         j += 1
@@ -46,9 +43,8 @@
                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
                 j += 1
                 result_length -= 1
-        flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch))
         rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True)
-    return u.build(), result_length, flag
+    return u.build(), result_length
 
 
 @specialize.ll()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -43,8 +43,8 @@
                 length = len(input)
             else:
                 w_cls = space.w_UnicodeEncodeError
-                length, flag = rutf8.check_utf8(input, allow_surrogates=True)
-                w_input = space.newutf8(input, length, flag)
+                length = rutf8.check_utf8(input, allow_surrogates=True)
+                w_input = space.newutf8(input, length)
             w_exc =  space.call_function(
                 w_cls,
                 space.newtext(encoding),
@@ -192,7 +192,7 @@
 def ignore_errors(space, w_exc):
     check_exception(space, w_exc)
     w_end = space.getattr(w_exc, space.newtext('end'))
-    return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII), w_end])
+    return space.newtuple([space.newutf8('', 0), w_end])
 
 REPLACEMENT = u'\ufffd'.encode('utf8')
 
@@ -203,13 +203,13 @@
     size = space.int_w(w_end) - space.int_w(w_start)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
         text = '?' * size
-        return space.newtuple([space.newutf8(text, size, rutf8.FLAG_ASCII), w_end])
+        return space.newtuple([space.newutf8(text, size), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
         text = REPLACEMENT
-        return space.newtuple([space.newutf8(text, 1, rutf8.FLAG_REGULAR), w_end])
+        return space.newtuple([space.newutf8(text, 1), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeTranslateError):
         text = REPLACEMENT * size
-        return space.newtuple([space.newutf8(text, size, rutf8.FLAG_REGULAR), w_end])
+        return space.newtuple([space.newutf8(text, size), w_end])
     else:
         raise oefmt(space.w_TypeError,
                     "don't know how to handle %T in error callback", w_exc)
@@ -237,8 +237,8 @@
             builder.append(";")
             pos = rutf8.next_codepoint_pos(obj, pos)
         r = builder.build()
-        lgt, flag = rutf8.check_utf8(r, True)
-        return space.newtuple([space.newutf8(r, lgt, flag), w_end])
+        lgt = rutf8.check_utf8(r, True)
+        return space.newtuple([space.newutf8(r, lgt), w_end])
     else:
         raise oefmt(space.w_TypeError,
                     "don't know how to handle %T in error callback", w_exc)
@@ -278,8 +278,8 @@
             builder.append_slice(num, 2, lnum)
             pos = rutf8.next_codepoint_pos(obj, pos)
         r = builder.build()
-        lgt, flag = rutf8.check_utf8(r, True)
-        return space.newtuple([space.newutf8(r, lgt, flag), w_end])
+        lgt = rutf8.check_utf8(r, True)
+        return space.newtuple([space.newutf8(r, lgt), w_end])
     else:
         raise oefmt(space.w_TypeError,
                     "don't know how to handle %T in error callback", w_exc)
@@ -417,9 +417,9 @@
         final = space.is_true(w_final)
         state = space.fromcache(CodecState)
         func = getattr(unicodehelper, rname)
-        result, consumed, length, flag = func(string, errors,
+        result, consumed, length = func(string, errors,
                                               final, state.decode_error_handler)
-        return space.newtuple([space.newutf8(result, length, flag),
+        return space.newtuple([space.newutf8(result, length),
                                space.newint(consumed)])
     wrap_decoder.func_name = rname
     globals()[name] = wrap_decoder
@@ -488,14 +488,14 @@
     state = space.fromcache(CodecState)
     # call the fast version for checking
     try:
-        lgt, flag = rutf8.check_utf8(string, allow_surrogates=True)
+        lgt = rutf8.check_utf8(string, allow_surrogates=True)
     except rutf8.CheckError:
-        res, consumed, lgt, flag = unicodehelper.str_decode_utf8(string,
+        res, consumed, lgt = unicodehelper.str_decode_utf8(string,
             errors, final, state.decode_error_handler)
-        return space.newtuple([space.newutf8(res, lgt, flag),
+        return space.newtuple([space.newutf8(res, lgt),
                                space.newint(consumed)])
     else:
-        return space.newtuple([space.newutf8(string, lgt, flag),
+        return space.newtuple([space.newutf8(string, lgt),
                                space.newint(len(string))])
 
 @unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int,
@@ -516,10 +516,10 @@
     consumed = len(data)
     if final:
         consumed = 0
-    res, consumed, lgt, flag, byteorder = str_decode_utf_16_helper(
+    res, consumed, lgt, byteorder = str_decode_utf_16_helper(
         data, errors, final,
         state.decode_error_handler, byteorder)
-    return space.newtuple([space.newutf8(res, lgt, flag),
+    return space.newtuple([space.newutf8(res, lgt),
                            space.newint(consumed),
                            space.newint(byteorder)])
 
@@ -539,10 +539,10 @@
     consumed = len(data)
     if final:
         consumed = 0
-    res, consumed, lgt, flag, byteorder = str_decode_utf_32_helper(
+    res, consumed, lgt, byteorder = str_decode_utf_32_helper(
         data, errors, final,
         state.decode_error_handler, byteorder)
-    return space.newtuple([space.newutf8(res, lgt, flag),
+    return space.newtuple([space.newutf8(res, lgt),
                            space.newint(consumed),
                            space.newint(byteorder)])
 
@@ -632,7 +632,7 @@
     if errors is None:
         errors = 'strict'
     if len(string) == 0:
-        return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII),
+        return space.newtuple([space.newutf8('', 0),
                                space.newint(0)])
 
     if space.is_none(w_mapping):
@@ -642,9 +642,9 @@
 
     final = True
     state = space.fromcache(CodecState)
-    result, consumed, lgt, flag = unicodehelper.str_decode_charmap(
+    result, consumed, lgt = unicodehelper.str_decode_charmap(
         string, errors, final, state.decode_error_handler, mapping)
-    return space.newtuple([space.newutf8(result, lgt, flag),
+    return space.newtuple([space.newutf8(result, lgt),
                            space.newint(consumed)])
 
 @unwrap_spec(errors='text_or_none')
@@ -708,12 +708,12 @@
 
     unicode_name_handler = state.get_unicodedata_handler(space)
 
-    result, consumed, lgt, flag = unicodehelper.str_decode_unicode_escape(
+    result, consumed, lgt = unicodehelper.str_decode_unicode_escape(
         string, errors,
         final, state.decode_error_handler,
         unicode_name_handler)
 
-    return space.newtuple([space.newutf8(result, lgt, flag), space.newint(consumed)])
+    return space.newtuple([space.newutf8(result, lgt), space.newint(consumed)])
 
 # ____________________________________________________________
 # Unicode-internal
@@ -731,15 +731,15 @@
     string = space.readbuf_w(w_string).as_str()
 
     if len(string) == 0:
-        return space.newtuple([space.newutf8('', 0, rutf8.FLAG_ASCII),
+        return space.newtuple([space.newutf8('', 0),
                                space.newint(0)])
 
     final = True
     state = space.fromcache(CodecState)
-    result, consumed, lgt, flag = unicodehelper.str_decode_unicode_internal(
+    result, consumed, lgt = unicodehelper.str_decode_unicode_internal(
         string, errors,
         final, state.decode_error_handler)
-    return space.newtuple([space.newutf8(result, lgt, flag),
+    return space.newtuple([space.newutf8(result, lgt),
                            space.newint(consumed)])
 
 # ____________________________________________________________
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -11,8 +11,8 @@
 from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong
 from rpython.rlib.rbigint import rbigint
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import (
-    FLAG_ASCII, check_utf8, next_codepoint_pos, codepoints_in_utf8)
+from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos,
+                                codepoints_in_utf8)
 
 
 STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -31,22 +31,22 @@
 
     def __init__(self, space):
         self.w_newlines_dict = {
-            SEEN_CR: space.newutf8("\r", 1, FLAG_ASCII),
-            SEEN_LF: space.newutf8("\n", 1, FLAG_ASCII),
-            SEEN_CRLF: space.newutf8("\r\n", 2, FLAG_ASCII),
+            SEEN_CR: space.newutf8("\r", 1),
+            SEEN_LF: space.newutf8("\n", 1),
+            SEEN_CRLF: space.newutf8("\r\n", 2),
             SEEN_CR | SEEN_LF: space.newtuple(
-                [space.newutf8("\r", 1, FLAG_ASCII),
-                 space.newutf8("\n", 1, FLAG_ASCII)]),
+                [space.newutf8("\r", 1),
+                 space.newutf8("\n", 1)]),
             SEEN_CR | SEEN_CRLF: space.newtuple(
-                [space.newutf8("\r", 1, FLAG_ASCII),
-                 space.newutf8("\r\n", 2, FLAG_ASCII)]),
+                [space.newutf8("\r", 1),
+                 space.newutf8("\r\n", 2)]),
             SEEN_LF | SEEN_CRLF: space.newtuple(
-                [space.newutf8("\n", 1, FLAG_ASCII),
-                 space.newutf8("\r\n", 2, FLAG_ASCII)]),
+                [space.newutf8("\n", 1),
+                 space.newutf8("\r\n", 2)]),
             SEEN_CR | SEEN_LF | SEEN_CRLF: space.newtuple(
-                [space.newutf8("\r", 1, FLAG_ASCII),
-                 space.newutf8("\n", 1, FLAG_ASCII),
-                 space.newutf8("\r\n", 2, FLAG_ASCII)]),
+                [space.newutf8("\r", 1),
+                 space.newutf8("\n", 1),
+                 space.newutf8("\r\n", 2)]),
             }
 
     @unwrap_spec(translate=int)
@@ -98,7 +98,7 @@
                 output_len -= 1
 
         if output_len == 0:
-            return space.newutf8("", 0, FLAG_ASCII)
+            return space.newutf8("", 0)
 
         # Record which newlines are read and do newline translation if
         # desired, all in one pass.
@@ -153,8 +153,8 @@
             output = builder.build()
 
         self.seennl |= seennl
-        lgt, flag = check_utf8(output, True)
-        return space.newutf8(output, lgt, flag)
+        lgt = check_utf8(output, True)
+        return space.newutf8(output, lgt)
 
     def reset_w(self, space):
         self.seennl = 0
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -66,8 +66,8 @@
         pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
         assert 0 <= pos <= len(object)
         self.pending = object[pos:]
-        lgt, flag = rutf8.get_utf8_length_flag(output)
-        return space.newutf8(output, lgt, flag)
+        lgt = rutf8.get_utf8_length_flag(output)
+        return space.newutf8(output, lgt)
 
 
 @unwrap_spec(errors="text_or_none")
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -78,12 +78,11 @@
             space.newtext(e.reason)]))
 
 def wrap_unicodeencodeerror(space, e, input, inputlen, name):
-    _, flag = rutf8.check_utf8(input, True)
     raise OperationError(
         space.w_UnicodeEncodeError,
         space.newtuple([
             space.newtext(name),
-            space.newutf8(input, inputlen, flag),
+            space.newutf8(input, inputlen),
             space.newint(e.start),
             space.newint(e.end),
             space.newtext(e.reason)]))
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -295,15 +295,15 @@
         if bits & 0x80:
             # the 8th bit is set, it's an utf8 string
             content_utf8 = self.getslice(start, end)
-            lgt, flag = unicodehelper.check_utf8_or_raise(self.space,
+            lgt = unicodehelper.check_utf8_or_raise(self.space,
                                                           content_utf8)
-            return self.space.newutf8(content_utf8, lgt, flag)
+            return self.space.newutf8(content_utf8, lgt)
         else:
             # ascii only, fast path (ascii is a strict subset of
             # latin1, and we already checked that all the chars are <
             # 128)
             return self.space.newutf8(self.getslice(start, end),
-                                      end - start, rutf8.FLAG_ASCII)
+                                      end - start)
 
     def decode_string_escaped(self, start):
         i = self.pos
@@ -316,10 +316,10 @@
             i += 1
             if ch == '"':
                 content_utf8 = builder.build()
-                lgt, f = unicodehelper.check_utf8_or_raise(self.space,
+                lgt = unicodehelper.check_utf8_or_raise(self.space,
                                                            content_utf8)
                 self.pos = i
-                return self.space.newutf8(content_utf8, lgt, f)
+                return self.space.newutf8(content_utf8, lgt)
             elif ch == '\\':
                 i = self.decode_escape_sequence(i, builder)
             elif ch < '\x20':
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -11,7 +11,7 @@
     dec.close()
 
 class FakeSpace(object):
-    def newutf8(self, s, l, f):
+    def newutf8(self, s, l):
         return s
 
 def test_decode_key():
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -228,8 +228,7 @@
             return space.newbytes(chr(ucharval))
         elif w_ffitype.is_unichar():
             wcharval = self.get_unichar(w_ffitype)
-            return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1,
-                                 rutf8.get_flag_from_code(intmask(wcharval)))
+            return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
         elif w_ffitype.is_double():
             return self._float(w_ffitype)
         elif w_ffitype.is_singlefloat():
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -43,8 +43,8 @@
             return space.newbytes(ctx._string[start:end])
         elif isinstance(ctx, rsre_core.UnicodeMatchContext):
             s = ctx._unicodestr[start:end]
-            lgt, flag = rutf8.check_utf8(s, True)
-            return space.newutf8(s, lgt, flag)
+            lgt = rutf8.check_utf8(s, True)
+            return space.newutf8(s, lgt)
         else:
             # unreachable
             raise SystemError
@@ -341,11 +341,10 @@
             else:
                 assert unicodebuilder is not None
                 return space.newutf8(unicodebuilder.build(),
-                                     unicodebuilder.get_length(),
-                                     unicodebuilder.get_flag()), n
+                                     unicodebuilder.get_length()), n
         else:
             if space.isinstance_w(w_string, space.w_unicode):
-                w_emptystr = space.newutf8('', 0, rutf8.FLAG_ASCII)
+                w_emptystr = space.newutf8('', 0)
             else:
                 w_emptystr = space.newbytes('')
             w_item = space.call_method(w_emptystr, 'join',
@@ -579,8 +578,8 @@
         elif isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string)
         elif isinstance(ctx, rsre_core.UnicodeMatchContext):
-            lgt, flag = rutf8.check_utf8(ctx._unicodestr, True)
-            return space.newutf8(ctx._unicodestr, lgt, flag)
+            lgt = rutf8.check_utf8(ctx._unicodestr, True)
+            return space.newutf8(ctx._unicodestr, lgt)
         else:
             raise SystemError
 
diff --git a/pypy/module/_warnings/interp_warnings.py b/pypy/module/_warnings/interp_warnings.py
--- a/pypy/module/_warnings/interp_warnings.py
+++ b/pypy/module/_warnings/interp_warnings.py
@@ -214,8 +214,8 @@
         message = "%s:%d: %s: %s\n" % (space.utf8_w(w_filename), lineno,
                                         space.utf8_w(w_name),
                                         space.utf8_w(w_text))
-        lgt, flag = rutf8.check_utf8(message, True)
-        w_message = space.newutf8(message, lgt, flag)
+        lgt = rutf8.check_utf8(message, True)
+        w_message = space.newutf8(message, lgt)
     else:
         w_message = space.newtext(message)
     space.call_method(w_stderr, "write", w_message)
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1014,8 +1014,7 @@
                 return space.newbytes(item)
             elif mytype.typecode == 'u':
                 code = ord(item)
-                return space.newutf8(rutf8.unichr_as_utf8(code), 1,
-                          rutf8.get_flag_from_code(code))
+                return space.newutf8(rutf8.unichr_as_utf8(code), 1)
             assert 0, "unreachable"
 
         # interface
diff --git a/pypy/module/exceptions/interp_exceptions.py b/pypy/module/exceptions/interp_exceptions.py
--- a/pypy/module/exceptions/interp_exceptions.py
+++ b/pypy/module/exceptions/interp_exceptions.py
@@ -78,7 +78,6 @@
 from pypy.interpreter.gateway import interp2app
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib import rwin32
-from rpython.rlib.rutf8 import FLAG_ASCII
 
 
 def readwrite_attrproperty_w(name, cls):
@@ -127,7 +126,7 @@
             return space.call_function(space.w_unicode, w_as_str)
         lgt = len(self.args_w)
         if lgt == 0:
-            return space.newutf8("", 0, FLAG_ASCII)
+            return space.newutf8("", 0)
         if lgt == 1:
             return space.call_function(space.w_unicode, self.args_w[0])
         else:
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -478,8 +478,8 @@
             # I suppose this is a valid utf8, but there is noone to check
             # and noone to catch an error either
             try:
-                lgt, flag = rutf8.check_utf8(s, True)
-                return space.newutf8(s, lgt, flag)
+                lgt = rutf8.check_utf8(s, True)
+                return space.newutf8(s, lgt)
             except rutf8.CheckError:
                 from pypy.interpreter import unicodehelper
                 # get the correct error msg
diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py
--- a/pypy/objspace/fake/objspace.py
+++ b/pypy/objspace/fake/objspace.py
@@ -209,7 +209,7 @@
     def newbytes(self, x):
         return w_some_obj()
 
-    def newutf8(self, x, l, f):
+    def newutf8(self, x, l):
         return w_some_obj()
 
     def new_from_utf8(self, a):
diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py
--- a/pypy/objspace/std/bytearrayobject.py
+++ b/pypy/objspace/std/bytearrayobject.py
@@ -195,11 +195,11 @@
         w_dict = self.getdict(space)
         if w_dict is None:
             w_dict = space.w_None
-        s, _, lgt, flag = str_decode_latin_1(''.join(self.getdata()), 'strict',
+        s, _, lgt = str_decode_latin_1(''.join(self.getdata()), 'strict',
             True, None)
         return space.newtuple([
             space.type(self), space.newtuple([
-                space.newutf8(s, lgt, flag), space.newtext('latin-1')]),
+                space.newutf8(s, lgt), space.newtext('latin-1')]),
             w_dict])
 
     @staticmethod
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1197,7 +1197,7 @@
     unerase = staticmethod(unerase)
 
     def wrap(self, unwrapped):
-        return self.space.newutf8(unwrapped, len(unwrapped), rutf8.FLAG_ASCII)
+        return self.space.newutf8(unwrapped, len(unwrapped))
 
     def unwrap(self, wrapped):
         return self.space.utf8_w(wrapped)
@@ -1239,7 +1239,7 @@
     ##     return self.space.newlist_bytes(self.listview_bytes(w_dict))
 
     def wrapkey(space, key):
-        return space.newutf8(key, len(key), rutf8.FLAG_ASCII)
+        return space.newutf8(key, len(key))
 
     ## @jit.look_inside_iff(lambda self, w_dict:
     ##                      w_dict_unrolling_heuristic(w_dict))
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -198,8 +198,8 @@
             if self.w_valuedict is None:
                 raise oefmt(space.w_TypeError, "format requires a mapping")
             if do_unicode:
-                lgt, flag = rutf8.check_utf8(key, True)
-                w_key = space.newutf8(key, lgt, flag)
+                lgt = rutf8.check_utf8(key, True)
+                w_key = space.newutf8(key, lgt)
             else:
                 w_key = space.newbytes(key)
             return space.getitem(self.w_valuedict, w_key)
@@ -513,8 +513,8 @@
     formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
     result = formatter.format()
     # this can force strings, not sure if it's a problem or not
-    lgt, flag = rutf8.check_utf8(result, True)
-    return space.newutf8(result, lgt, flag)
+    lgt = rutf8.check_utf8(result, True)
+    return space.newutf8(result, lgt)
 
 def mod_format(space, w_format, w_values, do_unicode=False):
     if space.isinstance_w(w_values, space.w_tuple):
diff --git a/pypy/objspace/std/listobject.py b/pypy/objspace/std/listobject.py
--- a/pypy/objspace/std/listobject.py
+++ b/pypy/objspace/std/listobject.py
@@ -1998,7 +1998,7 @@
 
     def wrap(self, stringval):
         assert stringval is not None
-        return self.space.newutf8(stringval, len(stringval), rutf8.FLAG_ASCII)
+        return self.space.newutf8(stringval, len(stringval))
 
     def unwrap(self, w_string):
         return self.space.utf8_w(w_string)
diff --git a/pypy/objspace/std/marshal_impl.py b/pypy/objspace/std/marshal_impl.py
--- a/pypy/objspace/std/marshal_impl.py
+++ b/pypy/objspace/std/marshal_impl.py
@@ -403,8 +403,8 @@
 @unmarshaller(TYPE_UNICODE)
 def unmarshal_unicode(space, u, tc):
     arg = u.get_str()
-    length, flag = unicodehelper.check_utf8_or_raise(space, arg)
-    return space.newutf8(arg, length, flag)
+    length = unicodehelper.check_utf8_or_raise(space, arg)
+    return space.newutf8(arg, length)
 
 @marshaller(W_SetObject)
 def marshal_set(space, w_set, m):
diff --git a/pypy/objspace/std/newformat.py b/pypy/objspace/std/newformat.py
--- a/pypy/objspace/std/newformat.py
+++ b/pypy/objspace/std/newformat.py
@@ -51,8 +51,8 @@
 
         if for_unicode:
             def wrap(self, u):
-                lgt, flag = rutf8.check_utf8(u, True)
-                return self.space.newutf8(u, lgt, flag)
+                lgt = rutf8.check_utf8(u, True)
+                return self.space.newutf8(u, lgt)
         else:
             def wrap(self, s):
                 return self.space.newbytes(s)
@@ -379,8 +379,8 @@
         template = unicode_template_formatter(space,
                                               space.utf8_w(w_string))
         r = template.build(args)
-        lgt, flag = rutf8.check_utf8(r, True)
-        return space.newutf8(r, lgt, flag)
+        lgt = rutf8.check_utf8(r, True)
+        return space.newutf8(r, lgt)
     else:
         template = str_template_formatter(space, space.bytes_w(w_string))
         return space.newbytes(template.build(args))
@@ -416,8 +416,8 @@
 
         if for_unicode:
             def wrap(self, u):
-                lgt, flag = rutf8.check_utf8(u, True)
-                return self.space.newutf8(u, lgt, flag)
+                lgt = rutf8.check_utf8(u, True)
+                return self.space.newutf8(u, lgt)
         else:
             def wrap(self, s):
                 return self.space.newbytes(s)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -165,8 +165,8 @@
             return self.newtext(x)
         if isinstance(x, unicode):
             x = x.encode('utf8')
-            lgt, flag = rutf8.check_utf8(x, True)
-            return self.newutf8(x, lgt, flag)
+            lgt = rutf8.check_utf8(x, True)
+            return self.newutf8(x, lgt)
         if isinstance(x, float):
             return W_FloatObject(x)
         if isinstance(x, W_Root):
@@ -362,16 +362,10 @@
             return self.w_None
         return self.newtext(s)
 
-    def newutf8(self, utf8s, length, flag):
+    def newutf8(self, utf8s, length):
         assert utf8s is not None
         assert isinstance(utf8s, str)
-        return W_UnicodeObject(utf8s, length, flag)
-
-    def new_from_utf8(self, utf8s):
-        # XXX: kill me!
-        assert isinstance(utf8s, str)
-        length, flag = rutf8.check_utf8(utf8s, True)
-        return W_UnicodeObject(utf8s, length, flag)
+        return W_UnicodeObject(utf8s, length)
 
     def newfilename(self, s):
         assert isinstance(s, str) # on pypy3, this decodes the byte string
diff --git a/pypy/objspace/std/setobject.py b/pypy/objspace/std/setobject.py
--- a/pypy/objspace/std/setobject.py
+++ b/pypy/objspace/std/setobject.py
@@ -1291,7 +1291,7 @@
         return self.space.utf8_w(w_item)
 
     def wrap(self, item):
-        return self.space.newutf8(item, len(item), rutf8.FLAG_ASCII)
+        return self.space.newutf8(item, len(item))
 
     def iter(self, w_set):
         return UnicodeIteratorImplementation(self.space, self, w_set)
@@ -1495,7 +1495,7 @@
 
     def next_entry(self):
         for key in self.iterator:
-            return self.space.newutf8(key, len(key), rutf8.FLAG_ASCII)
+            return self.space.newutf8(key, len(key))
         else:
             return None
 
diff --git a/pypy/objspace/std/test/test_index.py b/pypy/objspace/std/test/test_index.py
--- a/pypy/objspace/std/test/test_index.py
+++ b/pypy/objspace/std/test/test_index.py
@@ -265,8 +265,7 @@
 class AppTest_UnicodeTestCase(SeqTestCase, StringTestCase):
     def setup_method(self, method):
         SeqTestCase.setup_method(self, method)
-        self.w_seq = self.space.newutf8("this is a test", len("this is a test"),
-                                        rutf8.FLAG_ASCII)
+        self.w_seq = self.space.newutf8("this is a test", len("this is a test"))
         self.w_const = self.space.appexec([], """(): return unicode""")
 
 
diff --git a/pypy/objspace/std/test/test_lengthhint.py b/pypy/objspace/std/test/test_lengthhint.py
--- a/pypy/objspace/std/test/test_lengthhint.py
+++ b/pypy/objspace/std/test/test_lengthhint.py
@@ -74,8 +74,7 @@
         self._test_length_hint(self.space.wrap('P' * self.SIZE))
 
     def test_unicode(self):
-        self._test_length_hint(self.space.newutf8('Y' * self.SIZE, self.SIZE,
-                                                  rutf8.FLAG_ASCII))
+        self._test_length_hint(self.space.newutf8('Y' * self.SIZE, self.SIZE))
 
     def test_tuple(self):
         self._test_length_hint(self.space.wrap(tuple(self.ITEMS)))
diff --git a/pypy/objspace/std/test/test_liststrategies.py b/pypy/objspace/std/test/test_liststrategies.py
--- a/pypy/objspace/std/test/test_liststrategies.py
+++ b/pypy/objspace/std/test/test_liststrategies.py
@@ -7,7 +7,6 @@
     IntOrFloatListStrategy)
 from pypy.objspace.std import listobject
 from pypy.objspace.std.test.test_listobject import TestW_ListObject
-from rpython.rlib.rutf8 import FLAG_ASCII
 
 
 class TestW_ListStrategies(TestW_ListObject):
@@ -601,9 +600,9 @@
     def test_unicode(self):
         l1 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newbytes("zwei")])
         assert isinstance(l1.strategy, BytesListStrategy)
-        l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4, FLAG_ASCII), self.space.newutf8("zwei", 4, FLAG_ASCII)])
+        l2 = W_ListObject(self.space, [self.space.newutf8("eins", 4), self.space.newutf8("zwei", 4)])
         assert isinstance(l2.strategy, UnicodeListStrategy)
-        l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4, FLAG_ASCII)])
+        l3 = W_ListObject(self.space, [self.space.newbytes("eins"), self.space.newutf8("zwei", 4)])
         assert isinstance(l3.strategy, ObjectListStrategy)
 
     def test_listview_bytes(self):
diff --git a/pypy/objspace/std/test/test_obj.py b/pypy/objspace/std/test/test_obj.py
--- a/pypy/objspace/std/test/test_obj.py
+++ b/pypy/objspace/std/test/test_obj.py
@@ -17,7 +17,7 @@
         cls.w_cpython_apptest = space.wrap(option.runappdirect and not hasattr(sys, 'pypy_translation_info'))
 
         def w_unwrap_wrap_unicode(space, w_obj):
-            return space.newutf8(space.utf8_w(w_obj), w_obj._length, w_obj._get_flag())
+            return space.newutf8(space.utf8_w(w_obj), w_obj._length)
         cls.w_unwrap_wrap_unicode = space.wrap(gateway.interp2app(w_unwrap_wrap_unicode))
         def w_unwrap_wrap_str(space, w_obj):
             return space.wrap(space.str_w(w_obj))
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -27,12 +27,12 @@
         assert len(warnings) == 2
 
     def test_listview_unicode(self):
-        w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
+        w_str = self.space.newutf8('abcd', 4)
         assert self.space.listview_utf8(w_str) == list("abcd")
 
     def test_new_shortcut(self):
         space = self.space
-        w_uni = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
+        w_uni = self.space.newutf8('abcd', 4)
         w_new = space.call_method(
                 space.w_unicode, "__new__", space.w_unicode, w_uni)
         assert w_new is w_uni
@@ -44,8 +44,8 @@
             return   # skip this case
         v = u[start : start + len1]
         space = self.space
-        w_u = space.newutf8(u.encode('utf8'), len(u), rutf8.FLAG_REGULAR)
-        w_v = space.newutf8(v.encode('utf8'), len(v), rutf8.FLAG_REGULAR)
+        w_u = space.newutf8(u.encode('utf8'), len(u))
+        w_v = space.newutf8(v.encode('utf8'), len(v))
         expected = u.find(v, start, start + len1)
         try:
             w_index = space.call_method(w_u, 'index', w_v,
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -35,40 +35,22 @@
     _immutable_fields_ = ['_utf8']
 
     @enforceargs(utf8str=str)
-    def __init__(self, utf8str, length, flag):
+    def __init__(self, utf8str, length):
         assert isinstance(utf8str, str)
         assert length >= 0
         self._utf8 = utf8str
         self._length = length
-        if flag == rutf8.FLAG_ASCII:
-            self._index_storage = rutf8.UTF8_IS_ASCII
-        elif flag == rutf8.FLAG_HAS_SURROGATES:
-            self._index_storage = rutf8.UTF8_HAS_SURROGATES
-        else:
-            assert flag == rutf8.FLAG_REGULAR
-            self._index_storage = rutf8.null_storage()
+        self._index_storage = rutf8.null_storage()
         # XXX checking, remove before any performance measurments
         #     ifdef not_running_in_benchmark
         if not we_are_translated():
-            lgt, flag_check = rutf8.check_utf8(utf8str, True)
+            lgt = rutf8.check_utf8(utf8str, True)
             assert lgt == length
-            if flag_check == rutf8.FLAG_ASCII:
-                # there are cases where we copy part of REULAR that happens
-                # to be ascii
-                assert flag in (rutf8.FLAG_ASCII, rutf8.FLAG_REGULAR)
-            else:
-                assert flag == flag_check
-        # the storage can be one of:
-        # - null, unicode with no surrogates
-        # - rutf8.UTF8_HAS_SURROGATES
-        # - rutf8.UTF8_IS_ASCII
-        # - malloced object, which means it has index, then
-        #   _index_storage.flags determines the kind
 
     @staticmethod
     def from_utf8builder(builder):
         return W_UnicodeObject(
-            builder.build(), builder.get_length(), builder.get_flag())
+            builder.build(), builder.get_length())
 
     def __repr__(self):
         """representation for debugging purposes"""
@@ -108,8 +90,6 @@
         return space.text_w(space.str(self))
 
     def utf8_w(self, space):
-        if self._has_surrogates():
-            return rutf8.reencode_utf8_with_surrogates(self._utf8)
         return self._utf8
 
     def readbuf_w(self, space):
@@ -245,8 +225,7 @@
 
         assert isinstance(w_value, W_UnicodeObject)
         w_newobj = space.allocate_instance(W_UnicodeObject, w_unicodetype)
-        W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length,
-                                 w_value._get_flag())
+        W_UnicodeObject.__init__(w_newobj, w_value._utf8, w_value._length)
         if w_value._index_storage:
             # copy the storage if it's there
             w_newobj._index_storage = w_value._index_storage
@@ -393,8 +372,7 @@
                 elif space.isinstance_w(w_newval, space.w_int):
                     codepoint = space.int_w(w_newval)
                 elif isinstance(w_newval, W_UnicodeObject):
-                    builder.append_utf8(
-                        w_newval._utf8, w_newval._length, w_newval._get_flag())
+                    builder.append_utf8(w_newval._utf8, w_newval._length)
                     continue
                 else:
                     raise oefmt(space.w_TypeError,
@@ -481,16 +459,16 @@
             newlen += dist
             oldtoken = token
 
-        return W_UnicodeObject(expanded, newlen, self._get_flag())
+        return W_UnicodeObject(expanded, newlen)
 
     _StringMethods_descr_join = descr_join
     def descr_join(self, space, w_list):
         l = space.listview_utf8(w_list)
         if l is not None and self.is_ascii():
             if len(l) == 1:
-                return space.newutf8(l[0], len(l[0]), rutf8.FLAG_ASCII)
+                return space.newutf8(l[0], len(l[0]))
             s = self._utf8.join(l)
-            return space.newutf8(s, len(s), rutf8.FLAG_ASCII)
+            return space.newutf8(s, len(s))
         return self._StringMethods_descr_join(space, w_list)
 
     def _join_return_one(self, space, w_obj):
@@ -584,13 +562,6 @@
             return True
         return endswith(value, prefix, start, end)
 
-    def _get_flag(self):
-        if self.is_ascii():
-            return rutf8.FLAG_ASCII
-        elif self._has_surrogates():
-            return rutf8.FLAG_HAS_SURROGATES
-        return rutf8.FLAG_REGULAR
-
     def descr_add(self, space, w_other):
         try:
             w_other = self.convert_arg_to_w_unicode(space, w_other)
@@ -598,9 +569,8 @@
             if e.match(space, space.w_TypeError):
                 return space.w_NotImplemented
             raise
-        flag = rutf8.combine_flags(self._get_flag(), w_other._get_flag())
         return W_UnicodeObject(self._utf8 + w_other._utf8,
-                               self._len() + w_other._len(), flag)
+                               self._len() + w_other._len())
 
     @jit.look_inside_iff(lambda self, space, list_w, size:
                          jit.loop_unrolling_heuristic(list_w, size))
@@ -610,7 +580,6 @@
 
         prealloc_size = len(value) * (size - 1)
         unwrapped = newlist_hint(size)
-        flag = self._get_flag()
         for i in range(size):
             w_s = list_w[i]
             if not (space.isinstance_w(w_s, space.w_bytes) or
@@ -621,7 +590,6 @@
             # XXX Maybe the extra copy here is okay? It was basically going to
             #     happen anyway, what with being placed into the builder
             w_u = self.convert_arg_to_w_unicode(space, w_s)
-            flag = rutf8.combine_flags(flag, w_u._get_flag())
             unwrapped.append(w_u._utf8)
             lgt += w_u._length
             prealloc_size += len(unwrapped[i])
@@ -631,7 +599,7 @@
             if value and i != 0:
                 sb.append(value)
             sb.append(unwrapped[i])
-        return W_UnicodeObject(sb.build(), lgt, flag)
+        return W_UnicodeObject(sb.build(), lgt)
 
     @unwrap_spec(keepends=bool)
     def descr_splitlines(self, space, keepends=False):
@@ -660,8 +628,7 @@
                     lgt += line_end_chars
             assert eol >= 0
             assert sol >= 0
-            # XXX we can do better with flags here, if we want to
-            strs_w.append(W_UnicodeObject(value[sol:eol], lgt, self._get_flag()))
+            strs_w.append(W_UnicodeObject(value[sol:eol], lgt))
         return space.newlist(strs_w)
 
     def descr_upper(self, space):
@@ -675,11 +642,11 @@
     def descr_zfill(self, space, width):
         selfval = self._utf8
         if len(selfval) == 0:
-            return W_UnicodeObject('0' * width, width, rutf8.FLAG_ASCII)
+            return W_UnicodeObject('0' * width, width)
         num_zeros = width - self._len()
         if num_zeros <= 0:
             # cannot return self, in case it is a subclass of str
-            return W_UnicodeObject(selfval, self._len(), self._get_flag())
+            return W_UnicodeObject(selfval, self._len())
         builder = StringBuilder(num_zeros + len(selfval))
         if len(selfval) > 0 and (selfval[0] == '+' or selfval[0] == '-'):
             # copy sign to first position
@@ -689,7 +656,7 @@
             start = 0
         builder.append_multiple_char('0', num_zeros)
         builder.append_slice(selfval, start, len(selfval))
-        return W_UnicodeObject(builder.build(), width, self._get_flag())
+        return W_UnicodeObject(builder.build(), width)
 
     @unwrap_spec(maxsplit=int)
     def descr_split(self, space, w_sep=None, maxsplit=-1):
@@ -748,7 +715,7 @@
                 break
             i += 1
             byte_pos = self._index_to_byte(start + i * step)
-        return W_UnicodeObject(builder.build(), sl, self._get_flag())
+        return W_UnicodeObject(builder.build(), sl)
 
     def descr_getslice(self, space, w_start, w_stop):
         start, stop = normalize_simple_slice(
@@ -765,8 +732,7 @@
         assert stop >= 0
         byte_start = self._index_to_byte(start)
         byte_stop = self._index_to_byte(stop)
-        return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start,
-                               self._get_flag())
+        return W_UnicodeObject(self._utf8[byte_start:byte_stop], stop - start)
 
     def descr_capitalize(self, space):
         value = self._utf8
@@ -800,7 +766,7 @@
             centered = value
             d = 0
 
-        return W_UnicodeObject(centered, self._len() + d, self._get_flag())
+        return W_UnicodeObject(centered, self._len() + d)
 
     def descr_count(self, space, w_sub, w_start=None, w_end=None):
         value = self._utf8
@@ -828,9 +794,9 @@
         else:
             lgt, _ = rutf8.check_utf8(value, True, stop=pos)
             return space.newtuple(
-                [W_UnicodeObject(value[0:pos], lgt, self._get_flag()), w_sub,
+                [W_UnicodeObject(value[0:pos], lgt), w_sub,
                  W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
-                    self._len() - lgt - sublen, self._get_flag())])
+                    self._len() - lgt - sublen)])
 
     def descr_rpartition(self, space, w_sub):
         value = self._utf8
@@ -846,9 +812,9 @@
         else:
             lgt, _ = rutf8.check_utf8(value, True, stop=pos)
             return space.newtuple(
-                [W_UnicodeObject(value[0:pos], lgt, self._get_flag()), w_sub,
+                [W_UnicodeObject(value[0:pos], lgt), w_sub,
                  W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
-                    self._len() - lgt - sublen, self._get_flag())])
+                    self._len() - lgt - sublen)])
 
     @unwrap_spec(count=int)
     def descr_replace(self, space, w_old, w_new, count=-1):
@@ -866,9 +832,8 @@
         except OverflowError:
             raise oefmt(space.w_OverflowError, "replace string is too long")
 
-        flag = rutf8.combine_flags(self._get_flag(), w_by._get_flag())
         newlength = self._length + replacements * (w_by._length - w_sub._length)
-        return W_UnicodeObject(res, newlength, flag)
+        return W_UnicodeObject(res, newlength)
 
     def descr_mul(self, space, w_times):
         try:
@@ -880,29 +845,19 @@
         if times <= 0:
             return self._empty()
         if len(self._utf8) == 1:
-            return W_UnicodeObject(self._utf8[0] * times, times,
-                                   self._get_flag())
-        return W_UnicodeObject(self._utf8 * times, times * self._len(),
-                               self._get_flag())
+            return W_UnicodeObject(self._utf8[0] * times, times)
+        return W_UnicodeObject(self._utf8 * times, times * self._len())
 
     descr_rmul = descr_mul
 
     def _get_index_storage(self):
         # XXX write the correct jit.elidable
-        condition = (self._index_storage == rutf8.null_storage() or
-                     not bool(self._index_storage.contents))
-        if condition:
+        if self._index_storage == rutf8.null_storage():
             storage = rutf8.create_utf8_index_storage(self._utf8, self._length)
         else:
             storage = self._index_storage
         if not jit.isconstant(self):
-            prev_storage = self._index_storage
             self._index_storage = storage
-            if prev_storage == rutf8.UTF8_HAS_SURROGATES:
-                flag = rutf8.FLAG_HAS_SURROGATES
-            else:
-                flag = rutf8.FLAG_REGULAR
-            self._index_storage.flag = flag
         return storage
 
     def _getitem_result(self, space, index):
@@ -912,15 +867,15 @@
             raise oefmt(space.w_IndexError, "string index out of range")
         start = self._index_to_byte(index)
         end = rutf8.next_codepoint_pos(self._utf8, start)
-        return W_UnicodeObject(self._utf8[start:end], 1, self._get_flag())
+        return W_UnicodeObject(self._utf8[start:end], 1)
 
     def is_ascii(self):
-        return self._index_storage is rutf8.UTF8_IS_ASCII
+        return self._length == len(self._utf8)
 
     def _has_surrogates(self):
-        return (self._index_storage is rutf8.UTF8_HAS_SURROGATES or
-                (bool(self._index_storage) and
-                 self._index_storage.flag == rutf8.FLAG_HAS_SURROGATES))
+        if self.is_ascii():
+            return False
+        return rutf8.has_surrogates(self._utf8)
 
     def _index_to_byte(self, index):
         if self.is_ascii():
@@ -988,7 +943,6 @@
         if w_fillchar._len() != 1:
             raise oefmt(space.w_TypeError,
                         "rjust() argument 2 must be a single character")
-        flag = rutf8.combine_flags(self._get_flag(), w_fillchar._get_flag())
         d = width - lgt
         if d > 0:
             if len(w_fillchar._utf8) == 1:
@@ -996,9 +950,9 @@
                 value = d * w_fillchar._utf8[0] + value
             else:
                 value = d * w_fillchar._utf8 + value
-            return W_UnicodeObject(value, width, flag)
+            return W_UnicodeObject(value, width)
 
-        return W_UnicodeObject(value, lgt, flag)
+        return W_UnicodeObject(value, lgt)
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
     def descr_ljust(self, space, width, w_fillchar):
@@ -1007,7 +961,6 @@
         if w_fillchar._len() != 1:
             raise oefmt(space.w_TypeError,
                         "ljust() argument 2 must be a single character")
-        flag = rutf8.combine_flags(self._get_flag(), w_fillchar._get_flag())
         d = width - self._len()
         if d > 0:
             if len(w_fillchar._utf8) == 1:
@@ -1015,9 +968,9 @@
                 value = value + d * w_fillchar._utf8[0]
             else:
                 value = value + d * w_fillchar._utf8
-            return W_UnicodeObject(value, width, flag)
+            return W_UnicodeObject(value, width)
 
-        return W_UnicodeObject(value, self._len(), flag)
+        return W_UnicodeObject(value, self._len())
 
     def _utf8_sliced(self, start, stop, lgt):
         assert start >= 0
@@ -1025,7 +978,7 @@
         #if start == 0 and stop == len(s) and space.is_w(space.type(orig_obj),
         #                                                space.w_bytes):
         #    return orig_obj
-        return W_UnicodeObject(self._utf8[start:stop], lgt, self._get_flag())
+        return W_UnicodeObject(self._utf8[start:stop], lgt)
 
     def _strip_none(self, space, left, right):
         "internal function called by str_xstrip methods"
@@ -1073,7 +1026,7 @@
         return self._utf8_sliced(lpos, rpos, lgt)
 
     def descr_getnewargs(self, space):
-        return space.newtuple([W_UnicodeObject(self._utf8, self._length, self._get_flag())])
+        return space.newtuple([W_UnicodeObject(self._utf8, self._length)])
 
     _starts_ends_unicode = True
 
@@ -1158,11 +1111,11 @@
         if encoding == 'ascii':
             s = space.charbuf_w(w_obj)
             unicodehelper.check_ascii_or_raise(space, s)
-            return space.newutf8(s, len(s), rutf8.FLAG_ASCII)
+            return space.newutf8(s, len(s))
         if encoding == 'utf-8' or encoding == 'utf8':
             s = space.charbuf_w(w_obj)
-            lgt, flag = unicodehelper.check_utf8_or_raise(space, s)
-            return space.newutf8(s, lgt, flag)
+            lgt = unicodehelper.check_utf8_or_raise(space, s)
+            return space.newutf8(s, lgt)
     w_codecs = space.getbuiltinmodule("_codecs")
     w_decode = space.getattr(w_codecs, space.newtext("decode"))
     if errors is None:
@@ -1217,7 +1170,7 @@
         return unicode_from_encoded_object(space, w_bytes, encoding, "strict")
     s = space.bytes_w(w_bytes)
     unicodehelper.check_ascii_or_raise(space, s)
-    return W_UnicodeObject(s, len(s), rutf8.FLAG_ASCII)
+    return W_UnicodeObject(s, len(s))
 
 
 class UnicodeDocstrings:
@@ -1764,7 +1717,7 @@
     return [s for s in value]
 
 
-W_UnicodeObject.EMPTY = W_UnicodeObject('', 0, rutf8.FLAG_ASCII)
+W_UnicodeObject.EMPTY = W_UnicodeObject('', 0)
 
 
 # Helper for converting int/long
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: remove the flag

Reply via email to