Repository: thrift Updated Branches: refs/heads/master 517aa1491 -> 75d879ab4
THRIFT-2413 Add JSON escaped unicode support for python3. Client: Python Patch: Phongphan Phuttha This closes #686 Project: http://git-wip-us.apache.org/repos/asf/thrift/repo Commit: http://git-wip-us.apache.org/repos/asf/thrift/commit/369d62e5 Tree: http://git-wip-us.apache.org/repos/asf/thrift/tree/369d62e5 Diff: http://git-wip-us.apache.org/repos/asf/thrift/diff/369d62e5 Branch: refs/heads/master Commit: 369d62e5e925654fa6d1ca3bfe5d73023456adb1 Parents: 517aa14 Author: Phongphan Phuttha <[email protected]> Authored: Mon Nov 9 02:05:09 2015 +0700 Committer: Nobuaki Sukegawa <[email protected]> Committed: Sat Nov 14 17:03:57 2015 +0900 ---------------------------------------------------------------------- lib/py/src/protocol/TJSONProtocol.py | 49 +++++++++++++++++++------------ lib/py/test/thrift_json.py | 4 +-- 2 files changed, 32 insertions(+), 21 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/thrift/blob/369d62e5/lib/py/src/protocol/TJSONProtocol.py ---------------------------------------------------------------------- diff --git a/lib/py/src/protocol/TJSONProtocol.py b/lib/py/src/protocol/TJSONProtocol.py index e98f4cf..d210bff 100644 --- a/lib/py/src/protocol/TJSONProtocol.py +++ b/lib/py/src/protocol/TJSONProtocol.py @@ -249,6 +249,21 @@ class TJSONProtocolBase(TProtocolBase): def _isLowSurrogate(self, codeunit): return codeunit >= 0xdc00 and codeunit <= 0xdfff + def _toChar(self, high, low=None): + if not low: + if sys.version_info[0] == 2: + return ("\\u%04x" % high).decode('unicode-escape').encode('utf-8') + else: + return chr(high) + else: + codepoint = (1 << 16) + ((high & 0x3ff) << 10) + codepoint += low & 0x3ff + if sys.version_info[0] == 2: + s = "\\U%08x" % codepoint + return s.decode('unicode-escape').encode('utf-8') + else: + return chr(codepoint) + def readJSONString(self, skipContext): highSurrogate = None string = [] @@ -262,26 +277,22 @@ class TJSONProtocolBase(TProtocolBase): if ord(character) == ESCSEQ0: character = self.reader.read() if ord(character) == ESCSEQ1: - if sys.version_info[0] == 2: - import json - character = self.trans.read(4) - codeunit = int(character, 16) - if self._isHighSurrogate(codeunit): - if highSurrogate: - raise TProtocolException(TProtocolException.INVALID_DATA, - "Expected low surrogate char") - highSurrogate = character - continue - elif self._isLowSurrogate(codeunit): - if not highSurrogate: - raise TProtocolException(TProtocolException.INVALID_DATA, - "Expected high surrogate char") - character = json.JSONDecoder().decode('"\\u%s\\u%s"' % (highSurrogate, character)).encode('utf-8') - highSurrogate = None - else: - character = json.JSONDecoder().decode('"\\u%s"' % character).encode('utf-8') + character = self.trans.read(4).decode('ascii') + codeunit = int(character, 16) + if self._isHighSurrogate(codeunit): + if highSurrogate: + raise TProtocolException(TProtocolException.INVALID_DATA, + "Expected low surrogate char") + highSurrogate = codeunit + continue + elif self._isLowSurrogate(codeunit): + if not highSurrogate: + raise TProtocolException(TProtocolException.INVALID_DATA, + "Expected high surrogate char") + character = self._toChar(highSurrogate, codeunit) + highSurrogate = None else: - character = chr(int(self.trans.read(4))) + character = self._toChar(codeunit) else: if character not in ESCAPE_CHARS: raise TProtocolException(TProtocolException.INVALID_DATA, http://git-wip-us.apache.org/repos/asf/thrift/blob/369d62e5/lib/py/test/thrift_json.py ---------------------------------------------------------------------- diff --git a/lib/py/test/thrift_json.py b/lib/py/test/thrift_json.py index cef8870..6d6c8fa 100644 --- a/lib/py/test/thrift_json.py +++ b/lib/py/test/thrift_json.py @@ -15,8 +15,8 @@ import unittest class TestJSONString(unittest.TestCase): def test_escaped_unicode_string(self): - unicode_json = '"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab unicode"' - unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB unicode' + unicode_json = b'"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab\\udb40\\udc70 unicode"' + unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB\U000E0070 unicode' buf = TTransport.TMemoryBuffer(unicode_json) transport = TTransport.TBufferedTransportFactory().getTransport(buf)
