jmo-perspectives

svn Mon, 19 Dec 2005 13:31:37 -0800

Author: jmorliaguet
Date: Mon Dec 19 22:31:33 2005
New Revision: 2059

Added:
   cpsskins/branches/jmo-perspectives/minjson.py   (contents, props changed)
Log:


- added python-based JSON parser / serializer



Added: cpsskins/branches/jmo-perspectives/minjson.py
==============================================================================
--- (empty file)
+++ cpsskins/branches/jmo-perspectives/minjson.py       Mon Dec 19 22:31:33 2005
@@ -0,0 +1,506 @@
+##############################################################################
+#
+# Copyright (c) 2005 Jim Washington and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+
+# minjson.py
+# reads minimal javascript objects.
+# str's objects and fixes the text to write javascript.
+
+# Thanks to Patrick Logan for starting the json-py project and making so many
+# good test cases.
+
+# Jim Washington 10 Oct 2005.
+
+# 2005-10-10 on reading, looks for \\uxxxx and replaces with u'\uxxxx'
+# 2005-10-09 now tries hard to make all strings unicode when reading.
+# 2005-10-07 got rid of eval() completely, makes object as found by the
+#            tokenizer.
+# 2005-09-06 imported parsing constants from tokenize; they changed a bit from
+#            python2.3 to 2.4
+# 2005-08-22 replaced the read sanity code
+# 2005-08-21 Search for exploits on eval() yielded more default bad operators.
+# 2005-08-18 Added optional code from Koen van de Sande to escape
+#            outgoing unicode chars above 128
+
+
+from re import compile, sub, search, DOTALL
+from token import ENDMARKER, NAME, NUMBER, STRING, OP, ERRORTOKEN
+from tokenize import tokenize, TokenError, NL
+
+# set to true if transmission size is much more important than speed
+# only affects writing, and makes a minimal difference in output size.
+# (this is the default value of the stripWhiteSpace argument of write())
+alwaysStripWhiteSpace = False
+
+# set this to True if you want chars at or above 128 always expressed as
+# \uxxxx when writing.  this is expensive.
+doUxxxx = False
+
+# Usually, utf-8 will work, set this to utf-16 if you dare.
+# safeRead() falls back to this encoding when a byte string cannot be
+# decoded with the system default encoding.
+emergencyEncoding = 'utf-8'
+
+#################################
+#      read JSON object         #
+#################################
+
+slashstarcomment = compile(r'/\*.*?\*/',DOTALL)
+doubleslashcomment = compile(r'//.*\n')
+
+unichrRE = compile(r"\\u[0-9a-fA-F]{4,4}")
+
+def unichrReplace(match):
+    return unichr(int(match.group()[2:],16))
+
+escapeStrs = (('\\','\\\\'),('\n',r'\n'),('\b',r'\b'),
+    ('\f',r'\f'),('\t',r'\t'),('\r',r'\r'), ('"',r'\"')
+    )
+
+class DictToken:
+    __slots__=[]
+    pass
+class ListToken:
+    __slots__=[]
+    pass
+class ColonToken:
+    __slots__=[]
+    pass
+class CommaToken:
+    __slots__=[]
+    pass
+class JSONReader(object):
+    """raise SyntaxError if it is not JSON, and make the object available"""
+    def __init__(self,data):
+        self.stop = False
+        #make an iterator of data so that next() works in tokenize.
+        self._data = iter([data])
+        self.lastOp = None
+        self.objects = []
+        self.tokenize()
+        
+    def tokenize(self):
+        try:
+            tokenize(self._data.next,self.readTokens)
+        except TokenError:
+            raise SyntaxError
+
+    def resolveList(self):
+        #check for empty list
+        if isinstance(self.objects[-1],ListToken):
+            self.objects[-1] = []
+            return
+        theList = []
+        commaCount = 0
+        try:
+            item = self.objects.pop()
+        except IndexError:
+            raise SyntaxError
+        while not isinstance(item,ListToken):
+            if isinstance(item,CommaToken):
+                commaCount += 1
+            else:
+                theList.append(item)
+            try:
+                item = self.objects.pop()
+            except IndexError:
+                raise SyntaxError
+        if not commaCount == (len(theList) -1):
+            raise SyntaxError
+        theList.reverse()
+        item = theList
+        self.objects.append(item)
+
+    def resolveDict(self):
+        theList = []
+        #check for empty dict
+        if isinstance(self.objects[-1], DictToken):
+            self.objects[-1] = {}
+            return
+        #not empty; must have at least three values
+        try:
+            #value (we're going backwards!)
+            value = self.objects.pop()
+        except IndexError:
+            raise SyntaxError
+        try:
+            #colon
+            colon = self.objects.pop()
+            if not isinstance(colon, ColonToken):
+                raise SyntaxError
+        except IndexError:
+            raise SyntaxError
+        try:
+            #key
+            key = self.objects.pop()
+            if not isinstance(key,basestring):
+                raise SyntaxError
+        except IndexError:
+            
+            raise SyntaxError
+        #salt the while
+        comma = value
+        while not isinstance(comma,DictToken):
+            # store the value
+            theList.append((key,value))
+            #do it again...
+            try:
+                #might be a comma
+                comma = self.objects.pop()
+            except IndexError:
+                raise SyntaxError
+            if isinstance(comma,CommaToken):
+                #if it's a comma, get the values
+                try:
+                    value = self.objects.pop()
+                except IndexError:
+                    #print self.objects
+                    raise SyntaxError
+                try:
+                    colon = self.objects.pop()
+                    if not isinstance(colon, ColonToken):
+                        raise SyntaxError
+                except IndexError:
+                    raise SyntaxError
+                try:
+                    key = self.objects.pop()
+                    if not isinstance(key,basestring):
+                        raise SyntaxError
+                except IndexError:
+                    raise SyntaxError
+        theDict = {}
+        for k in theList:
+            theDict[k[0]] = k[1]
+        self.objects.append(theDict)
+
+    def readTokens(self,type, token, (srow, scol), (erow, ecol), line):
+        # UPPERCASE consts from tokens.py or tokenize.py
+        if type == OP:
+            if token not in "[{}],:-":
+                raise SyntaxError
+            else:
+                self.lastOp = token
+            if token == '[':
+                self.objects.append(ListToken())
+            elif token == '{':
+                self.objects.append(DictToken())
+            elif token == ']':
+                self.resolveList()
+            elif token == '}':
+                self.resolveDict()
+            elif token == ':':
+                self.objects.append(ColonToken())
+            elif token == ',':
+                self.objects.append(CommaToken())
+        elif type == STRING:
+            tok = token[1:-1]
+            for k in escapeStrs:
+                if k[1] in tok:
+                    tok = tok.replace(k[1],k[0])
+            self.objects.append(tok)
+        elif type == NUMBER:
+            if self.lastOp == '-':
+                factor = -1
+            else:
+                factor = 1
+            try:
+                self.objects.append(factor * int(token))
+            except ValueError:
+                self.objects.append(factor * float(token))
+        elif type == NAME:
+            try:
+                self.objects.append({'true':True,
+                    'false':False,'null':None}[token])
+            except KeyError:
+                raise SyntaxError
+        elif type == ENDMARKER:
+            pass
+        elif type == NL:
+            pass
+        elif type == ERRORTOKEN:
+            if ecol == len(line):
+                #it's a char at the end of the line.  (mostly) harmless.
+                pass
+            else:
+                raise SyntaxError
+        else:
+            raise SyntaxError
+    def output(self):
+        try:
+            assert len(self.objects) == 1
+        except AssertionError:
+            raise SyntaxError
+        return self.objects[0]
+
+def safeRead(aString, encoding=None):
+    """read the js, first sanitizing a bit and removing any c-style comments
+    If the input is a unicode string, that's OK.  If the input is a byte 
string, 
+    strings in the object will be produced as unicode anyway.
+    """
+    # get rid of trailing null. Konqueror appends this.
+    CHR0 = chr(0)
+    while aString.endswith(CHR0):
+        aString = aString[:-1]
+    # strip leading and trailing whitespace
+    aString = aString.strip()
+    # zap /* ... */ comments
+    aString = slashstarcomment.sub('',aString)
+    # zap // comments
+    aString = doubleslashcomment.sub('',aString)
+    # detect and handle \\u unicode characters. Note: This has the side effect
+    # of converting the entire string to unicode. This is probably OK.
+    unicodechars = unichrRE.search(aString)
+    if unicodechars:
+        aString = unichrRE.sub(unichrReplace, aString)
+    #if it's already unicode, we won't try to decode it
+    if isinstance(aString, unicode):
+        s = aString
+    else:
+        if encoding:
+            # note: no "try" here.  the encoding provided must work for the 
+            # incoming byte string.  UnicodeDecode error will be raised
+            # in that case.  Often, it will be best not to provide the encoding
+            # and allow the default
+            s = unicode(aString, encoding)
+            #print "decoded %s from %s" % (s,encoding)
+        else:
+            # let's try to decode to unicode in system default encoding
+            try:
+                s = unicode(aString)
+                #import sys
+                #print "decoded %s from %s" % (s,sys.getdefaultencoding())
+            except UnicodeDecodeError:
+                # last choice: handle as emergencyEncoding
+                enc = emergencyEncoding
+                s = unicode(aString, enc)
+                #print "%s decoded from %s" % (s, enc)
+    # parse and get the object.
+    try:
+        data = JSONReader(s).output()
+    except SyntaxError:
+        raise ReadException, 'Unacceptable JSON expression: %s' % aString
+    return data
+
+read = safeRead
+
+#################################
+#   write object as JSON        #
+#################################
+
+#alwaysStripWhiteSpace is defined at the top of the module
+
+tfnTuple = (('True','true'),('False','false'),('None','null'),)
+
+def _replaceTrueFalseNone(aString):
+    """replace True, False, and None with javascript counterparts"""
+    for k in tfnTuple:
+        if k[0] in aString:
+            aString = aString.replace(k[0],k[1])
+    return aString
+
+def _handleCode(subStr,stripWhiteSpace):
+    """replace True, False, and None with javascript counterparts if
+       appropriate, remove unicode u's, fix long L's, make tuples
+       lists, and strip white space if requested
+    """
+    if 'e' in subStr:
+        #True, False, and None have 'e' in them. :)
+        subStr = (_replaceTrueFalseNone(subStr))
+    if stripWhiteSpace:
+        # re.sub might do a better job, but takes longer.
+        # Spaces are the majority of the whitespace, anyway...
+        subStr = subStr.replace(' ','')
+    if subStr[-1] in "uU":
+        #remove unicode u's
+        subStr = subStr[:-1]
+    if "L" in subStr:
+        #remove Ls from long ints
+        subStr = subStr.replace("L",'')
+    #do tuples as lists
+    if "(" in subStr:
+        subStr = subStr.replace("(",'[')
+    if ")" in subStr:
+        subStr = subStr.replace(")",']')
+    return subStr
+
+# re for a double-quoted string that has a single-quote in it
+# but no double-quotes and python punctuation after:
+redoublequotedstring = compile(r'"[^"]*\'[^"]*"[,\]\}:\)]')
+escapedSingleQuote = r"\'"
+escapedDoubleQuote = r'\"'
+
+def _doQuotesSwapping(aString):
+    """rewrite doublequoted strings with single quotes as singlequoted strings 
with
+    escaped single quotes"""
+    s = []
+    foundlocs = redoublequotedstring.finditer(aString)
+    prevend = 0
+    for loc in foundlocs:
+        start,end = loc.span()
+        s.append(aString[prevend:start])
+        tempstr = aString[start:end]
+        endchar = tempstr[-1]
+        ts1 = tempstr[1:-2]
+        ts1 = ts1.replace("'",escapedSingleQuote)
+        ts1 = "'%s'%s" % (ts1,endchar)
+        s.append(ts1)
+        prevend = end
+    s.append(aString[prevend:])
+    return ''.join(s)
+
+strEscapes = (('\n',r'\n'),('\b',r'\b'),
+    ('\f',r'\f'),('\t',r'\t'),('\r',r'\r'),('\u',r'\u') )
+
+unicodeRE = compile(u"([\u0080-\uffff])")
+unicodeREfunction = lambda(x): r"\u%04x" % ord(x.group(1))
+    
+slashxRX = compile(r"\\x[0-9a-fA-F]{2,2}")
+
+xmlcharRX = compile(r"&#[0-9a-fA-F]{2,4};")
+
+def slashxRXReplace(match):
+    return unichr(int(match.group()[2:],16))
+
+def uxreplace(match):
+    c = match.group()[2:-1]
+    l = len(c)
+    if l == 2:
+        return "\\u00%s" % c
+    elif l == 3:
+        return "\\u0%s" % c
+    elif l == 4:
+        return "\\u%s" % c
+
+def _escapeSomeStringChars(aString):
+    """replace single-character chars that have an escaped
+    representation with their literals"""
+    if doUxxxx:
+        # escape anything above 128 as \uxxxx
+        if unicodeRE.search(aString):
+            aString = unicodeRE.sub(unicodeREfunction, aString)
+    
+    for character,replacement in strEscapes:
+        if character in aString:
+            aString = aString.replace(character,replacement)
+    return aString
+
+def _pyexpr2jsexpr(aString, stripWhiteSpace):
+    """Take advantage of python's formatting of string representations of
+    objects.  Python always uses "'" to delimit strings.  Except it doesn't
+    when there is ' in the string.  Fix that, then, if we split
+    on that delimiter, we have a list that alternates non-string text with
+    string text.  Since string text is already properly escaped, we
+    only need to replace True, False, and None in non-string text and
+    remove any unicode 'u's preceding string values.
+
+    if stripWhiteSpace is True, remove spaces, etc from the non-string
+    text.
+
+    aString is the text to convert; the converted text is returned.
+    """
+    #do some escaping first
+    aString = _escapeSomeStringChars(aString)
+    #python will quote with " when there is a ' in the string,
+    #so fix that first
+    if redoublequotedstring.search(aString):
+        aString = _doQuotesSwapping(aString)
+    marker = None
+    if escapedSingleQuote in aString:
+        #replace escaped single quotes with a marker
+        marker = markerBase = '|'
+        markerCount = 1
+        while marker in aString:
+            #if the marker is already there, make it different
+            markerCount += 1
+            marker = markerBase * markerCount
+        aString = aString.replace(escapedSingleQuote,marker)
+    #escape double-quotes
+    aString = aString.replace('"',escapedDoubleQuote)
+    #split the string on the real single-quotes
+    splitStr = aString.split("'")
+    outList = []
+    # alt is True for the first piece and flips after every piece, so the
+    # pieces at even positions are treated as non-string text
+    alt = True
+    for subStr in splitStr:
+        if alt:
+            #if alt is True, non-string; do replacements
+            subStr = _handleCode(subStr,stripWhiteSpace)
+        outList.append(subStr)
+        alt = not alt
+    # joining on " turns every single-quote delimiter into a double-quote
+    result = '"'.join(outList)
+    if marker:
+        #put the escaped single-quotes back as "'"
+        result = result.replace(marker,"'")
+    return result
+
+def write(obj, encoding='utf-8', stripWhiteSpace=alwaysStripWhiteSpace,\
+    encodeOutput=None):
+    """Represent the object as a byte string in JSON notation.
+    
+    JSON specification says that the output is unicode, and what we really 
+    usually want is an encoded byte string for output. Since this method cannot
+    know whether it is "at the boundary", it will by default output a python 
+    unicode object that you can encode at the boundary.  
+    
+    If there are bytestrings in the object that cannot be decoded in the system
+    default encoding, this will decode them to unicode by an encoding provided.
+    The default, utf-8, will work in most cases.  
+    
+    This method includes an experimental encodeOutput parameter, which may 
+    suffice to make a byte string is that's what you want.  The encodeOutput 
+    parameter needs to be the python identifier for the desired charset 
+    encoding.  By default, a python unicode string will be output.
+        
+    """
+    #  first, we get a representation of the object in unicode
+    #  we will encode to an encoding later if desired.
+    if not isinstance(obj,unicode):
+        #get representation of object as unicode.
+        aString = str(obj)
+        if encoding:
+            aString = unicode(aString, encoding)
+        else:
+            aString = unicode(aString, 'utf-8')
+        #if \xnn converted to \\xnn, put those chars back
+        if slashxRX.search(aString):
+            aString = slashxRX.sub(slashxRXReplace, aString)
+    else:
+        # it's already a unicode string, no need to convert to unicode
+        aString = obj
+    
+    if isinstance(obj,basestring):
+        aString = aString.replace('\\','\\\\')
+        if '"' in aString:
+            aString = '"%s"' % aString.replace('"',escapedDoubleQuote)
+        else:
+            aString = '"%s"' % aString
+        result = _escapeSomeStringChars(aString)
+    else:
+        result = _pyexpr2jsexpr(aString,stripWhiteSpace)
+        
+    #assert isinstance(result,unicode)
+    
+    if encodeOutput:
+        # we have a choice here; xmlcharrefreplace or backslashreplace.
+        # since it is likely a web thing, we choose xmlcharrefreplace and 
+        # convert that back to \\unnn representation.  This is experimental
+        # and may not do the right thing.
+        result = result.encode(encodeOutput,"xmlcharrefreplace")
+        if xmlcharRX.search(result):
+            #if \xnn converted to &#nn;, put those chars back as \\u00nn
+            #print "result before conversion: %s" % result
+            result = xmlcharRX.sub(uxreplace, result)
+            #print "result after conversion: %s" % result
+            # Waah! it replaces unprintable chars in the charset with printable
+            # but different chars.  It should be rare.
+    return result
+
+class ReadException(Exception):
+    pass
+
+class WriteException(Exception):
+    pass
-- 
http://lists.nuxeo.com/mailman/listinfo/z3lab-checkins

[Z3lab-checkins] r2059 - cpsskins/branches/jmo-perspectives

Reply via email to