On Thu, Oct 13, 2016 at 10:10 AM, Martijn Pieters <m...@zopatista.com> wrote:
> # HG changeset patch > # User Martijn Pieters <mjpiet...@fb.com> > # Date 1476346188 -3600 > # Thu Oct 13 09:09:48 2016 +0100 > # Node ID 81d23b9e2b329666db6e342f6bafec54a893687c > # Parent 733fb9f7bc92c694ba6bededaeb93206528c0bcd > py3: refactor token parsing to handle call args properly > > The token parsing was getting unwieldy and was too naive about accessing > arguments. > > diff --git a/mercurial/__init__.py b/mercurial/__init__.py > --- a/mercurial/__init__.py > +++ b/mercurial/__init__.py > @@ -185,6 +185,61 @@ > OR CACHED FILES WON'T GET INVALIDATED PROPERLY. > """ > futureimpline = False > + > + # The following utility functions access the tokens list and i > index of > + # the for i, t in enumerate(tokens) loop below > + def _is_op(j, *o): > + """Assert that tokens[j] is an OP with one of the given > values""" > + try: > + return tokens[j].type == token.OP and tokens[j].string in > o > + except IndexError: > + return False > + > + def _find_argn_of_call(n): > + """Find arg n of a call expression (start at 0) > + > + Returns index of the first token of that argument, or None if > + there are not that many arguments. > + > + Assumes that tokens[i + 1] is '('. > + > + """ > + j = i + 2 > + nested = 0 > + try: > + while True: > + if _is_op(j, ')', ']', '}'): > + # end of call, tuple, subscription or dict / set > + nested -= 1 > + if nested < 0: > + return None > + elif n == 0: > + # this is the starting position of arg > + return j > + elif _is_op(j, '(', '[', '{'): > + nested += 1 > + elif _is_op(j, ',') and nested == 0: > + n -= 1 > + j += 1 > + except IndexError: > + return None > + > + def _ensure_unicode(j): > + """Make sure the token at j is a unicode string > + > + This rewrites a string token to include the unicode literal > prefix > + so the string transformer won't add the byte prefix. > + > + Ignores tokens that are not strings. Assumes bounds checking > has > + already been done. 
> + > + """ > + st = tokens[j] > + if st.type == token.STRING and st.string.startswith(("'", > '"')): > + rt = tokenize.TokenInfo(st.type, 'u%s' % st.string, > + st.start, st.end, st.line) > + tokens[j] = rt > + > for i, t in enumerate(tokens): > # Convert most string literals to byte literals. String > literals > # in Python 2 are bytes. String literals in Python 3 are > unicode. > @@ -241,91 +296,35 @@ > '') > continue > > - try: > - nexttoken = tokens[i + 1] > - except IndexError: > - nexttoken = None > - > - try: > - prevtoken = tokens[i - 1] > - except IndexError: > - prevtoken = None > - > # This looks like a function call. > - if (t.type == token.NAME and nexttoken and > - nexttoken.type == token.OP and nexttoken.string == '('): > + if t.type == token.NAME and _is_op(i + 1, '('): > fn = t.string > > # *attr() builtins don't accept byte strings to 2nd > argument. > - # Rewrite the token to include the unicode literal prefix > so > - # the string transformer above doesn't add the byte > prefix. > - if fn in ('getattr', 'setattr', 'hasattr', 'safehasattr'): > - try: > - # (NAME, 'getattr') > - # (OP, '(') > - # (NAME, 'foo') > - # (OP, ',') > - # (NAME|STRING, foo) > - st = tokens[i + 4] > - if (st.type == token.STRING and > - st.string[0] in ("'", '"')): > - rt = tokenize.TokenInfo(st.type, 'u%s' % > st.string, > - st.start, st.end, > st.line) > - tokens[i + 4] = rt > - except IndexError: > - pass > + if (fn in ('getattr', 'setattr', 'hasattr', > 'safehasattr') and > + not _is_op(i - 1, '.')): > + arg1idx = _find_argn_of_call(1) > + if arg1idx is not None: > + _ensure_unicode(arg1idx) > > # .encode() and .decode() on str/bytes/unicode don't > accept > - # byte strings on Python 3. Rewrite the token to include > the > - # unicode literal prefix so the string transformer above > doesn't > - # add the byte prefix. The loop helps in handling multiple > - # arguments. 
> - if (fn in ('encode', 'decode') and > - prevtoken.type == token.OP and prevtoken.string == > '.'): > - # (OP, '.') > - # (NAME, 'encode') > - # (OP, '(') > - # [(VARIABLE, encoding)] > - # [(OP, '.')] > - # [(VARIABLE, encoding)] > - # [(OP, ',')] > - # (STRING, 'utf-8') > - # (OP, ')') > - j = i > - try: > - while (tokens[j + 1].string in ('(', ',', '.')): > - st = tokens[j + 2] > - if (st.type == token.STRING and > - st.string[0] in ("'", '"')): > - rt = tokenize.TokenInfo(st.type, > - 'u%s' % st.string, > - st.start, st.end, > st.line) > - tokens[j + 2] = rt > - j = j + 2 > - except IndexError: > - pass > + # byte strings on Python 3. > + elif fn in ('encode', 'decode') and _is_op(i - 1, '.'): > + for argn in range(2): > + argidx = _find_argn_of_call(argn) > + if argidx is not None: > + _ensure_unicode(argidx) > > - # Bare open call (not an attribute on something else) > - if (fn == 'open' and not (prevtoken.type == token.OP and > - prevtoken.string == '.')): > - try: > - # (NAME, 'open') > - # (OP, '(') > - # (NAME|STRING, 'filename') > - # (OP, ',') > - # (NAME|STRING, mode) > - st = tokens[i + 4] > - if (st.type == token.STRING and > - st.string[0] in ("'", '"')): > - rt = tokenize.TokenInfo(st.type, 'u%s' % > st.string, > - st.start, st.end, > st.line) > - tokens[i + 4] = rt > - except IndexError: > - pass > + # Bare open call (not an attribute on something else), the > + # second argument (mode) must be a string, not bytes > + elif fn == 'open' and not _is_op(i - 1, '.'): > + arg1idx = _find_argn_of_call(1) > + if arg1idx is not None: > + _ensure_unicode(arg1idx) > > # It changes iteritems to items as iteritems is not > # present in Python 3 world. > - if fn == 'iteritems': > + elif fn == 'iteritems': > yield tokenize.TokenInfo(t.type, 'items', > t.start, t.end, t.line) > continue > @@ -337,7 +336,7 @@ > # ``replacetoken`` or any mechanism that changes semantics of module > # loading is changed. 
Otherwise cached bytecode may get loaded without > # the new transformation mechanisms applied. > - BYTECODEHEADER = b'HG\x00\x05' > + BYTECODEHEADER = b'HG\x00\x06' > > class hgloader(importlib.machinery.SourceFileLoader): > """Custom module loader that transforms source code. > Nice refactor. While I haven't looked at the code in detail yet, does check-code not complain about the use of underscore_function_names?
_______________________________________________ Mercurial-devel mailing list Mercurial-devel@mercurial-scm.org https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel