Repository: thrift Updated Branches: refs/heads/master f26488490 -> 7f01e2a8f
THRIFT-2413: UTF-8 sent by PHP as JSON is not understood by TJsonProtocol Client: Python Patch: Phongphan Phuttha This patch allows readJSONString to decode escaped Unicode strings, including encoded surrogate pairs. This closes #673 Project: http://git-wip-us.apache.org/repos/asf/thrift/repo Commit: http://git-wip-us.apache.org/repos/asf/thrift/commit/7f01e2a8 Tree: http://git-wip-us.apache.org/repos/asf/thrift/tree/7f01e2a8 Diff: http://git-wip-us.apache.org/repos/asf/thrift/diff/7f01e2a8 Branch: refs/heads/master Commit: 7f01e2a8f869d8622bc56e7584cce98865fa8b0f Parents: f264884 Author: Phongphan Phuttha <[email protected]> Authored: Fri Nov 6 15:46:50 2015 +0700 Committer: Nobuaki Sukegawa <[email protected]> Committed: Mon Nov 9 02:34:28 2015 +0900 ---------------------------------------------------------------------- lib/py/src/protocol/TJSONProtocol.py | 32 ++++++++++++++++++++++++++++++- lib/py/test/thrift_json.py | 31 ++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/thrift/blob/7f01e2a8/lib/py/src/protocol/TJSONProtocol.py ---------------------------------------------------------------------- diff --git a/lib/py/src/protocol/TJSONProtocol.py b/lib/py/src/protocol/TJSONProtocol.py index 3ed8bcb..e98f4cf 100644 --- a/lib/py/src/protocol/TJSONProtocol.py +++ b/lib/py/src/protocol/TJSONProtocol.py @@ -243,7 +243,14 @@ class TJSONProtocolBase(TProtocolBase): raise TProtocolException(TProtocolException.INVALID_DATA, "Unexpected character: %s" % current) + def _isHighSurrogate(self, codeunit): + return codeunit >= 0xd800 and codeunit <= 0xdbff + + def _isLowSurrogate(self, codeunit): + return codeunit >= 0xdc00 and codeunit <= 0xdfff + def readJSONString(self, skipContext): + highSurrogate = None string = [] if skipContext is False: self.context.read() @@ -255,7 +262,26 @@ class TJSONProtocolBase(TProtocolBase): if
ord(character) == ESCSEQ0: character = self.reader.read() if ord(character) == ESCSEQ1: - character = chr(int(self.trans.read(4))) + if sys.version_info[0] == 2: + import json + character = self.trans.read(4) + codeunit = int(character, 16) + if self._isHighSurrogate(codeunit): + if highSurrogate: + raise TProtocolException(TProtocolException.INVALID_DATA, + "Expected low surrogate char") + highSurrogate = character + continue + elif self._isLowSurrogate(codeunit): + if not highSurrogate: + raise TProtocolException(TProtocolException.INVALID_DATA, + "Expected high surrogate char") + character = json.JSONDecoder().decode('"\\u%s\\u%s"' % (highSurrogate, character)).encode('utf-8') + highSurrogate = None + else: + character = json.JSONDecoder().decode('"\\u%s"' % character).encode('utf-8') + else: + character = chr(int(self.trans.read(4))) else: if character not in ESCAPE_CHARS: raise TProtocolException(TProtocolException.INVALID_DATA, @@ -270,6 +296,10 @@ class TJSONProtocolBase(TProtocolBase): utf8_bytes.append(ord(self.reader.read())) character = utf8_bytes.decode('utf8') string.append(character) + + if highSurrogate: + raise TProtocolException(TProtocolException.INVALID_DATA, + "Expected low surrogate char") return ''.join(string) def isJSONNumeric(self, character): http://git-wip-us.apache.org/repos/asf/thrift/blob/7f01e2a8/lib/py/test/thrift_json.py ---------------------------------------------------------------------- diff --git a/lib/py/test/thrift_json.py b/lib/py/test/thrift_json.py new file mode 100644 index 0000000..cef8870 --- /dev/null +++ b/lib/py/test/thrift_json.py @@ -0,0 +1,31 @@ +from thrift import Thrift +from thrift.protocol.TJSONProtocol import TJSONProtocol +from thrift.transport import TTransport + +import sys +import unittest + +# +# In order to run the test under Windows. 
We need to create symbolic link +# name 'thrift' to '../src' folder by using: +# +# mklink /D thrift ..\src +# + +class TestJSONString(unittest.TestCase): + + def test_escaped_unicode_string(self): + unicode_json = '"hello \\u0e01\\u0e02\\u0e03\\ud835\\udcab unicode"' + unicode_text = u'hello \u0e01\u0e02\u0e03\U0001D4AB unicode' + + buf = TTransport.TMemoryBuffer(unicode_json) + transport = TTransport.TBufferedTransportFactory().getTransport(buf) + protocol = TJSONProtocol(transport) + + if sys.version_info[0] == 2: + unicode_text = unicode_text.encode('utf8') + self.assertEqual(protocol.readString(), unicode_text) + +if __name__ == '__main__': + unittest.main() +
