Author: jmorliaguet Date: Sat Feb 18 15:46:34 2006 New Revision: 2415 Modified: cpsskins/branches/jmo-perspectives/minjson.py Log:
- upgraded to the latest version of minjson.py Modified: cpsskins/branches/jmo-perspectives/minjson.py ============================================================================== --- cpsskins/branches/jmo-perspectives/minjson.py (original) +++ cpsskins/branches/jmo-perspectives/minjson.py Sat Feb 18 15:46:34 2006 @@ -16,11 +16,24 @@ # reads minimal javascript objects. # str's objects and fixes the text to write javascript. +#UNICODE USAGE: Minjson tries hard to accommodate naive usage in a +#"Do what I mean" manner. Real applications should handle unicode separately. +# The "right" way to use minjson in an application is to provide minjson a +# python unicode string for reading and accept a unicode output from minjson's +# writing. That way, the assumptions for unicode are yours and not minjson's. + +# That said, the minjson code has some (optional) unicode handling that you +# may look at as a model for the unicode handling your application may need. + # Thanks to Patrick Logan for starting the json-py project and making so many # good test cases. -# Jim Washington 10 Oct 2005. +# Additional thanks to Balazs Ree for replacing the writing module. +# Jim Washington 30 Dec 2005. + +# 2005-12-30 writing now traverses the object tree instead of relying on +# str() or unicode() # 2005-10-10 on reading, looks for \\uxxxx and replaces with u'\uxxxx' # 2005-10-09 now tries hard to make all strings unicode when reading. # 2005-10-07 got rid of eval() completely, makes object as found by the @@ -37,17 +50,15 @@ from token import ENDMARKER, NAME, NUMBER, STRING, OP, ERRORTOKEN from tokenize import tokenize, TokenError, NL -# set to true if transmission size is much more important than speed -# only affects writing, and makes a minimal difference in output size. -alwaysStripWhiteSpace = False - -# set this to True if you want chars above 128 always expressed as /uxxx -# this is expensive. -doUxxxx = False - #Usually, utf-8 will work, set this to utf-16 if you dare. 
emergencyEncoding = 'utf-8' +class ReadException(Exception): + pass + +class WriteException(Exception): + pass + ################################# # read JSON object # ################################# @@ -76,6 +87,7 @@ class CommaToken: __slots__=[] pass + class JSONReader(object): """raise SyntaxError if it is not JSON, and make the object available""" def __init__(self,data): @@ -85,7 +97,7 @@ self.lastOp = None self.objects = [] self.tokenize() - + def tokenize(self): try: tokenize(self._data.next,self.readTokens) @@ -143,7 +155,7 @@ if not isinstance(key,basestring): raise SyntaxError except IndexError: - + raise SyntaxError #salt the while comma = value @@ -241,8 +253,8 @@ def safeRead(aString, encoding=None): """read the js, first sanitizing a bit and removing any c-style comments - If the input is a unicode string, that's OK. If the input is a byte string, - strings in the object will be produced as unicode anyway. + If the input is a unicode string, great. That's preferred. If the input + is a byte string, strings in the object will be produced as unicode anyway. """ # get rid of trailing null. Konqueror appends this. CHR0 = chr(0) @@ -264,7 +276,7 @@ s = aString else: if encoding: - # note: no "try" here. the encoding provided must work for the + # note: no "try" here. the encoding provided must work for the # incoming byte string. UnicodeDecode error will be raised # in that case. 
Often, it will be best not to provide the encoding # and allow the default @@ -294,213 +306,131 @@ # write object as JSON # ################################# -#alwaysStripWhiteSpace is defined at the top of the module +import re, codecs +from cStringIO import StringIO -tfnTuple = (('True','true'),('False','false'),('None','null'),) +### Codec error handler -def _replaceTrueFalseNone(aString): - """replace True, False, and None with javascript counterparts""" - for k in tfnTuple: - if k[0] in aString: - aString = aString.replace(k[0],k[1]) - return aString - -def _handleCode(subStr,stripWhiteSpace): - """replace True, False, and None with javascript counterparts if - appropriate, remove unicode u's, fix long L's, make tuples - lists, and strip white space if requested - """ - if 'e' in subStr: - #True, False, and None have 'e' in them. :) - subStr = (_replaceTrueFalseNone(subStr)) - if stripWhiteSpace: - # re.sub might do a better job, but takes longer. - # Spaces are the majority of the whitespace, anyway... 
- subStr = subStr.replace(' ','') - if subStr[-1] in "uU": - #remove unicode u's - subStr = subStr[:-1] - if "L" in subStr: - #remove Ls from long ints - subStr = subStr.replace("L",'') - #do tuples as lists - if "(" in subStr: - subStr = subStr.replace("(",'[') - if ")" in subStr: - subStr = subStr.replace(")",']') - return subStr - -# re for a double-quoted string that has a single-quote in it -# but no double-quotes and python punctuation after: -redoublequotedstring = compile(r'"[^"]*\'[^"]*"[,\]\}:\)]') -escapedSingleQuote = r"\'" -escapedDoubleQuote = r'\"' - -def _doQuotesSwapping(aString): - """rewrite doublequoted strings with single quotes as singlequoted strings with - escaped single quotes""" - s = [] - foundlocs = redoublequotedstring.finditer(aString) - prevend = 0 - for loc in foundlocs: - start,end = loc.span() - s.append(aString[prevend:start]) - tempstr = aString[start:end] - endchar = tempstr[-1] - ts1 = tempstr[1:-2] - ts1 = ts1.replace("'",escapedSingleQuote) - ts1 = "'%s'%s" % (ts1,endchar) - s.append(ts1) - prevend = end - s.append(aString[prevend:]) - return ''.join(s) - -strEscapes = (('\n',r'\n'),('\b',r'\b'), - ('\f',r'\f'),('\t',r'\t'),('\r',r'\r'),('\u',r'\u') ) - -unicodeRE = compile(u"([\u0080-\uffff])") -unicodeREfunction = lambda(x): r"\u%04x" % ord(x.group(1)) - -slashxRX = compile(r"\\x[0-9a-fA-F]{2,2}") +def jsonreplace_handler(exc): + '''Error handler for json -xmlcharRX = compile(r"&#[0-9a-fA-F]{2,4};") + If encoding fails, \\uxxxx must be emitted. This + is similar to the "backslashreplace" handler, only + that we never emit \\xnn since this is not legal + according to the JSON syntax specs. 
+ ''' + if isinstance(exc, UnicodeEncodeError): + part = exc.object[exc.start] + # repr(part) will convert u'\unnnn' to u'\\unnnn' + return u'\\u%04x' % ord(part), exc.start+1 + else: + raise exc -def slashxRXReplace(match): - return unichr(int(match.group()[2:],16)) +# register the error handler +codecs.register_error('jsonreplace', jsonreplace_handler) -def uxreplace(match): - c = match.group()[2:-1] - l = len(c) - if l == 2: - return "\\u00%s" % c - elif l == 3: - return "\\u0%s" % c - elif l == 4: - return "\\u%s" % c - -def _escapeSomeStringChars(aString): - """replace single-character chars that have an escaped - representation with their literals""" - if doUxxxx: - # escape anything above 128 as \uxxxx - if unicodeRE.search(aString): - aString = unicodeRE.sub(unicodeREfunction, aString) - - for character,replacement in strEscapes: - if character in aString: - aString = aString.replace(character,replacement) - return aString - -def _pyexpr2jsexpr(aString, stripWhiteSpace): - """Take advantage of python's formatting of string representations of - objects. Python always uses "'" to delimit strings. Except it doesn't when - there is ' in the string. Fix that, then, if we split - on that delimiter, we have a list that alternates non-string text with - string text. Since string text is already properly escaped, we - only need to replace True, False, and None in non-string text and - remove any unicode 'u's preceding string values. +### Writer - if stripWhiteSpace is True, remove spaces, etc from the non-string - text. 
- """ - #do some escaping first - aString = _escapeSomeStringChars(aString) - #python will quote with " when there is a ' in the string, - #so fix that first - if redoublequotedstring.search(aString): - aString = _doQuotesSwapping(aString) - marker = None - if escapedSingleQuote in aString: - #replace escaped single quotes with a marker - marker = markerBase = '|' - markerCount = 1 - while marker in aString: - #if the marker is already there, make it different - markerCount += 1 - marker = markerBase * markerCount - aString = aString.replace(escapedSingleQuote,marker) - #escape double-quotes - aString = aString.replace('"',escapedDoubleQuote) - #split the string on the real single-quotes - splitStr = aString.split("'") - outList = [] - alt = True - for subStr in splitStr: - if alt: - #if alt is True, non-string; do replacements - subStr = _handleCode(subStr,stripWhiteSpace) - outList.append(subStr) - alt = not alt - result = '"'.join(outList) - if marker: - #put the escaped single-quotes back as "'" - result = result.replace(marker,"'") - return result - -def write(obj, encoding='utf-8', stripWhiteSpace=alwaysStripWhiteSpace,\ - encodeOutput=None): - """Represent the object as a byte string in JSON notation. - - JSON specification says that the output is unicode, and what we really - usually want is an encoded byte string for output. Since this method cannot - know whether it is "at the boundary", it will by default output a python - unicode object that you can encode at the boundary. - - If there are bytestrings in the object that cannot be decoded in the system - default encoding, this will decode them to unicode by an encoding provided. - The default, utf-8, will work in most cases. - - This method includes an experimental encodeOutput parameter, which may - suffice to make a byte string is that's what you want. The encodeOutput - parameter needs to be the python identifier for the desired charset - encoding. By default, a python unicode string will be output. 
- - """ - # first, we get a representation of the object in unicode - # we will encode to an encoding later if desired. - if not isinstance(obj,unicode): - #get representation of object as unicode. - aString = str(obj) - if encoding: - aString = unicode(aString, encoding) +def write(input, encoding='utf-8', outputEncoding=None): + writer = JsonWriter(input_encoding=encoding, output_encoding=outputEncoding) + writer.write(input) + return writer.getvalue() + +re_strmangle = re.compile('"|\b|\f|\n|\r|\t|\\\\') + +def func_strmangle(match): + return { + '"': '\\"', + '\b': '\\b', + '\f': '\\f', + '\n': '\\n', + '\r': '\\r', + '\t': '\\t', + '\\': '\\\\', + }[match.group(0)] + +def strmangle(text): + return re_strmangle.sub(func_strmangle, text) + +class JsonStream(object): + + def __init__(self): + self.buf = [] + + def write(self, text): + self.buf.append(text) + + def getvalue(self): + return ''.join(self.buf) + +class JsonWriter(object): + + def __init__(self, stream=None, input_encoding='utf-8', output_encoding=None): + ''' + - stream is optional, if specified must also give output_encoding + - The input strings can be unicode or in input_encoding + - output_encoding is optional, if omitted, result will be unicode + ''' + if stream is not None: + if output_encoding is None: + raise WriteException, 'If a stream is given, output encoding must also be provided' else: - aString = unicode(aString, 'utf-8') - #if \xnn converted to \\xnn, put those chars back - if slashxRX.search(aString): - aString = slashxRX.sub(slashxRXReplace, aString) - else: - # it's already a unicode string, no need to convert to unicode - aString = obj - - if isinstance(obj,basestring): - aString = aString.replace('\\','\\\\') - if '"' in aString: - aString = '"%s"' % aString.replace('"',escapedDoubleQuote) + stream = JsonStream() + self.stream = stream + self.input_encoding = input_encoding + self.output_encoding = output_encoding + + def write(self, obj): + if isinstance(obj, (list, tuple)): + 
self.stream.write('[') + first = True + for elem in obj: + if first: + first = False + else: + self.stream.write(',') + self.write(elem) + self.stream.write(']'), + elif isinstance(obj, dict): + self.stream.write('{') + first = True + for key, value in obj.iteritems(): + if first: + first = False + else: + self.stream.write(',') + self.write(key) + self.stream.write(':') + self.write(value) + self.stream.write('}') + elif obj is True: + self.stream.write('true') + elif obj is False: + self.stream.write('false') + elif obj is None: + self.stream.write('null') + elif not isinstance(obj, basestring): + # if we are not baseobj, convert to it + try: + obj = str(obj) + except Exception, exc: + raise WriteException, 'Cannot write object (%s: %s)' % (exc.__class__, exc) + self.stream.write(obj) else: - aString = '"%s"' % aString - result = _escapeSomeStringChars(aString) - else: - result = _pyexpr2jsexpr(aString,stripWhiteSpace) - - #assert isinstance(result,unicode) - - if encodeOutput: - # we have a choice here; xmlcharrefreplace or backslashreplace. - # since it is likely a web thing, we choose xmlcharrefreplace and - # convert that back to \\unnn representation. This is experimental - # and may not do the right thing. - result = result.encode(encodeOutput,"xmlcharrefreplace") - if xmlcharRX.search(result): - #if \xnn converted to &#nn;, put those chars back as \\u00nn - #print "result before conversion: %s" % result - result = xmlcharRX.sub(uxreplace, result) - #print "result after conversion: %s" % result - # Waah! it replaces unprintable chars in the charset with printable - # but different chars. It should be rare. 
- return result - -class ReadException(Exception): - pass + # convert to unicode first + if not isinstance(obj, unicode): + try: + obj = unicode(obj, self.input_encoding) + except (UnicodeDecodeError, UnicodeTranslateError): + obj = unicode(obj, 'utf-8', 'replace') + # do the mangling + obj = strmangle(obj) + # make the encoding + if self.output_encoding is not None: + obj = obj.encode(self.output_encoding, 'jsonreplace') + self.stream.write('"') + self.stream.write(obj) + self.stream.write('"') -class WriteException(Exception): - pass + def getvalue(self): + return self.stream.getvalue() -- http://lists.nuxeo.com/mailman/listinfo/z3lab-checkins