[pypy-commit] pypy unicode-utf8-py3: fix test

2018-11-25 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95361:fad3f16aa899
Date: 2018-11-20 22:36 -0800
http://bitbucket.org/pypy/pypy/changeset/fad3f16aa899/

Log:fix test

diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -689,7 +689,7 @@
 with raises_w(space, TypeError):
 PyUnicode_FromEncodedObject(
 space, space.wrap(u_text), null_charp, None)
-assert space.unicode_w(PyUnicode_FromEncodedObject(
+assert space.text_w(PyUnicode_FromEncodedObject(
 space, space.newbytes(s_text), null_charp, None)) == u_text
 rffi.free_charp(b_text)
 
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: add more tests

2018-11-25 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95362:a9ecac2f678b
Date: 2018-11-23 08:59 -0600
http://bitbucket.org/pypy/pypy/changeset/a9ecac2f678b/

Log:add more tests

diff --git a/pypy/module/_multibytecodec/test/test_app_incremental.py b/pypy/module/_multibytecodec/test/test_app_incremental.py
--- a/pypy/module/_multibytecodec/test/test_app_incremental.py
+++ b/pypy/module/_multibytecodec/test/test_app_incremental.py
@@ -1,5 +1,6 @@
+import os
 class AppTestClasses:
-spaceconfig = dict(usemodules=['_multibytecodec'])
+spaceconfig = dict(usemodules=['_multibytecodec', '_codecs', '_io'])
 
 def setup_class(cls):
 cls.w_IncrementalHzDecoder = cls.space.appexec([], """():
@@ -29,6 +30,7 @@
 
 return IncrementalBig5hkscsEncoder
 """)
+cls.w_myfile = cls.space.wrap(os.path.dirname(__file__))
 
 def test_decode_hz(self):
 d = self.IncrementalHzDecoder()
@@ -170,3 +172,27 @@
 assert r == b'\x88f'
 r = e.encode('\u0304')
 assert r == b'\x88b'
+
+def test_incremental_big5hkscs(self):
+import _codecs, _io
+with open(self.myfile + '/big5hkscs.txt', 'rb') as fid:
+uni_str =  fid.read()
+with open(self.myfile + '/big5hkscs-utf8.txt', 'rb') as fid:
+utf8str =  fid.read()
+UTF8Reader = _codecs.lookup('utf-8').streamreader
+for sizehint in [None] + list(range(1, 33)) + \
+[64, 128, 256, 512, 1024]:
+istream = UTF8Reader(_io.BytesIO(utf8str))
+ostream = _io.BytesIO()
+encoder = self.IncrementalBig5hkscsEncoder()
+while 1:
+if sizehint is not None:
+data = istream.read(sizehint)
+else:
+data = istream.read()
+
+if not data:
+break
+e = encoder.encode(data)
+ostream.write(e)
+assert ostream.getvalue() == uni_str
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -21,10 +21,10 @@
 def test_decode_hz():
 # stateful
 c = getcodec("hz")
-u = decode(c, "~{abc}")
-assert u == u'\u5f95\u6cef'.encode('utf8')
+utf8 = decode(c, "~{abc}")
+assert utf8.decode('utf8') == u'\u5f95\u6cef'
 u = decode(c, "~{")
-assert u == ''
+assert u == u''
 
 def test_decodeex_hz():
 c = getcodec("hz")
@@ -85,13 +85,13 @@
 
 def test_decode_hz_ignore():
 c = getcodec("hz")
-u = decode(c, 'def~{}abc', 'ignore')
-assert u == u'def\u5fcf'.encode('utf8')
+utf8 = decode(c, 'def~{}abc', 'ignore')
+assert utf8.decode('utf8') == u'def\u5f95'
 
 def test_decode_hz_replace():
 c = getcodec("hz")
-u = decode(c, 'def~{}abc', 'replace')
-assert u == u'def\ufffd\u5fcf'.encode('utf8')
+utf8 = decode(c, 'def~{}abc', 'replace')
+assert utf8.decode('utf8') == u'def\ufffd\u5f95\ufffd'
 
 def test_encode_hz():
 c = getcodec("hz")
@@ -130,3 +130,4 @@
 return u'\xc3'.encode('utf8'), endingpos
 s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'foo', errorhandler)
 assert '\xc3' in s
+
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: return consumed, not unicode lgt

2018-11-25 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95365:5ff7a09cd178
Date: 2018-11-24 00:02 -0600
http://bitbucket.org/pypy/pypy/changeset/5ff7a09cd178/

Log:return consumed, not unicode lgt

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -754,7 +754,7 @@
 res, lgt, pos = unicodehelper.str_decode_utf8(string,
 errors, final, state.decode_error_handler)
 return space.newtuple([space.newutf8(res, lgt),
-   space.newint(lgt)])
+   space.newint(pos)])
 else:
 return space.newtuple([space.newutf8(string, lgt),
space.newint(len(string))])
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: fix wrong unicode length

2018-11-25 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95363:a325699736bc
Date: 2018-11-23 08:59 -0600
http://bitbucket.org/pypy/pypy/changeset/a325699736bc/

Log:fix wrong unicode length

diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -269,7 +269,8 @@
 unicodedata, start, end)
 if rettype == 'u':
 codec = pypy_cjk_enc_getcodec(encodebuf)
-replace = encode(codec, replace, end - start)
+lgt = rutf8.check_utf8(replace, False)
+replace = encode(codec, replace, lgt)
 lgt = len(replace)
 with rffi.scoped_nonmovingbuffer(replace) as inbuf:
 r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, lgt, end)
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: improve and add a test

2018-11-25 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95366:2d33f43487c5
Date: 2018-11-25 12:30 -0600
http://bitbucket.org/pypy/pypy/changeset/2d33f43487c5/

Log:improve and add a test

diff --git a/pypy/module/_locale/test/test_locale.py b/pypy/module/_locale/test/test_locale.py
--- a/pypy/module/_locale/test/test_locale.py
+++ b/pypy/module/_locale/test/test_locale.py
@@ -20,6 +20,7 @@
 # check whether used locales are installed, otherwise the tests will
 # fail
 current = _locale.setlocale(_locale.LC_ALL)
+cls.oldlocale = current
 try:
 try:
 # some systems are only UTF-8 oriented
@@ -45,6 +46,12 @@
 finally:
 _locale.setlocale(_locale.LC_ALL, current)
 
+def teardown_class(cls):
+import _locale
+_locale.setlocale(_locale.LC_ALL, cls.oldlocale)
+
+
+
 def test_import(self):
 import _locale
 assert _locale
@@ -299,3 +306,36 @@
 assert lang is None or isinstance(lang, str)
 assert encoding.startswith('cp')
 
+def test_lc_numeric_basic(self):
+from _locale import (setlocale, nl_langinfo, Error, LC_NUMERIC,
+ LC_CTYPE, RADIXCHAR, THOUSEP, localeconv)
+# Test nl_langinfo against localeconv
+candidate_locales = ['es_UY', 'fr_FR', 'fi_FI', 'es_CO', 'pt_PT', 'it_IT',
+'et_EE', 'es_PY', 'no_NO', 'nl_NL', 'lv_LV', 'el_GR', 'be_BY', 'fr_BE',
+'ro_RO', 'ru_UA', 'ru_RU', 'es_VE', 'ca_ES', 'se_NO', 'es_EC', 'id_ID',
+'ka_GE', 'es_CL', 'wa_BE', 'hu_HU', 'lt_LT', 'sl_SI', 'hr_HR', 'es_AR',
+'es_ES', 'oc_FR', 'gl_ES', 'bg_BG', 'is_IS', 'mk_MK', 'de_AT', 'pt_BR',
+'da_DK', 'nn_NO', 'cs_CZ', 'de_LU', 'es_BO', 'sq_AL', 'sk_SK', 'fr_CH',
+'de_DE', 'sr_YU', 'br_FR', 'nl_BE', 'sv_FI', 'pl_PL', 'fr_CA', 'fo_FO',
+'bs_BA', 'fr_LU', 'kl_GL', 'fa_IR', 'de_BE', 'sv_SE', 'it_CH', 'uk_UA',
+'eu_ES', 'vi_VN', 'af_ZA', 'nb_NO', 'en_DK', 'tg_TJ', 'ps_AF', 'en_US',
+'fr_FR.ISO8859-1', 'fr_FR.UTF-8', 'fr_FR.ISO8859-15@euro',
+'ru_RU.KOI8-R', 'ko_KR.eucKR']
+
+tested = False
+for loc in candidate_locales:
+try:
+setlocale(LC_NUMERIC, loc)
+setlocale(LC_CTYPE, loc)
+except Error:
+continue
+for li, lc in ((RADIXCHAR, "decimal_point"),
+(THOUSEP, "thousands_sep")):
+nl_radixchar = nl_langinfo(li)
+li_radixchar = localeconv()[lc]
+try:
+set_locale = setlocale(LC_NUMERIC)
+except Error:
+set_locale = ""
+assert nl_radixchar == li_radixchar, ("nl_langinfo != localeconv "
+"(set to %s, using %s)" % ( loc, set_locale))
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: add missing test files

2018-11-25 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95364:2a5ae5b4e029
Date: 2018-11-23 09:26 -0600
http://bitbucket.org/pypy/pypy/changeset/2a5ae5b4e029/

Log:add missing test files

diff --git a/pypy/module/_multibytecodec/test/big5hkscs-utf8.txt b/pypy/module/_multibytecodec/test/big5hkscs-utf8.txt
new file mode 100644
--- /dev/null
+++ b/pypy/module/_multibytecodec/test/big5hkscs-utf8.txt
@@ -0,0 +1,2 @@
+𠄌Ě鵮罓洆
+ÊÊ̄ê êê̄
diff --git a/pypy/module/_multibytecodec/test/big5hkscs.txt b/pypy/module/_multibytecodec/test/big5hkscs.txt
new file mode 100644
--- /dev/null
+++ b/pypy/module/_multibytecodec/test/big5hkscs.txt
@@ -0,0 +1,2 @@
+�E�\�s�ڍ�
+�f�b�� 
diff --git a/pypy/module/_multibytecodec/test/test_multibtye_codecs.py b/pypy/module/_multibytecodec/test/test_multibtye_codecs.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_multibytecodec/test/test_multibtye_codecs.py
@@ -0,0 +1,64 @@
+import os
+
+class AppTestPartialEvaluation:
+spaceconfig = dict(usemodules=['_multibytecodec', '_codecs'])
+
+def setup_class(cls):
+cls.w_myfile = cls.space.wrap(os.path.dirname(__file__))
+
+def test_callback_None_index(self):
+import _multibytecodec, _codecs
+codec = _multibytecodec.__getcodec('cp932')
+def myreplace(exc):
+return ('x', None)
+_codecs.register_error("test.cjktest", myreplace)
+raises(TypeError, codec.encode, '\udeee', 'test.cjktest')
+
+def test_callback_backward_index(self):
+import _multibytecodec, _codecs
+codec = _multibytecodec.__getcodec('cp932')
+def myreplace(exc):
+if myreplace.limit > 0:
+myreplace.limit -= 1
+return ('REPLACED', 0)
+else:
+return ('TERMINAL', exc.end)
+myreplace.limit = 3
+_codecs.register_error("test.cjktest", myreplace)
+assert (codec.encode('abcd' + '\udeee' + 'efgh', 'test.cjktest') == 
+(b'abcdREPLACEDabcdREPLACEDabcdREPLACEDabcdTERMINALefgh', 9))
+
+def test_callback_forward_index(self):
+import _multibytecodec, _codecs
+codec = _multibytecodec.__getcodec('cp932')
+def myreplace(exc):
+return ('REPLACED', exc.end + 2)
+_codecs.register_error("test.cjktest", myreplace)
+assert (codec.encode('abcd' + '\udeee' + 'efgh', 'test.cjktest') == 
+ (b'abcdREPLACEDgh', 9))
+
+def _test_incrementalencoder(self):
+import _multibytecodec, _codecs, _io
+with open(self.myfile + '/shift_jis.txt', 'rb') as fid:
+uni_str =  fid.read()
+with open(self.myfile + '/shift_jis-utf8.txt', 'rb') as fid:
+utf8str =  fid.read()
+UTF8Reader = _codecs.lookup('utf-8').streamreader
+for sizehint in [None] + list(range(1, 33)) + \
+[64, 128, 256, 512, 1024]:
+istream = UTF8Reader(_io.BytesIO(utf8str))
+ostream = _io.BytesIO()
+codec = _multibytecodec.__getcodec('cp932')
+print(dir(codec))
+encoder = codec.incrementalencoder()
+while 1:
+if sizehint is not None:
+data = istream.read(sizehint)
+else:
+data = istream.read()
+
+if not data:
+break
+e = encoder.encode(data)
+ostream.write(e)
+assert ostream.getvalue() == uni_str
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-py3: UnicodeListStrategy can hold utf8, not just ascii

2018-11-25 Thread mattip
Author: Matti Picus 
Branch: unicode-utf8-py3
Changeset: r95367:a841b6df8847
Date: 2018-11-25 12:31 -0600
http://bitbucket.org/pypy/pypy/changeset/a841b6df8847/

Log:UnicodeListStrategy can hold utf8, not just ascii

diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -98,7 +98,6 @@
 return self._utf8
 
 def listview_utf8(self):
-assert self.is_ascii()
 return _create_list_from_unicode(self._utf8)
 
 def ord(self, space):
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit