On Thursday, December 2, 2021 at 10:31:56 AM UTC+1 Edward K. Ream wrote:
>
> > Idea: use tokenize python module...
>
> This idea might work, but the attached script "wishes away" the
> difficulties involved in generating code.
>
>
Attached to this message is an improved version of this script. It
doesn't "wish away the difficulties": it does a full import. Each
module-level function and class definition goes into a separate node,
and for each class node that has more than 20 lines, each method goes
into a separate child node too. All other lines that fall between
those nodes go into their own nodes with the headline
'...some declarations'. If such a body contains only a single string,
its headline is changed to `__doc__`, and if it contains only
comments, its headline is changed to '...comments'.
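For example, given a small (made-up) module like:

    '''A docstring.'''
    import os

    def spam():
        return os.getcwd()

    class Ham:
        # assume the class body runs past 20 lines...
        def eggs(self):
            pass

the script produces an outline roughly like:

    root                     (body: '@others' plus any trailing lines)
        ...some declarations (the docstring and the import)
        spam
        Ham                  (one child per method, since it exceeds 20 lines)
            eggs

Had the leading block contained only the docstring, its headline would
have been renamed to `__doc__`.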
This is pretty much how I would split any module by hand. I don't know
what your tests for the Python importer look like, but I suggest that
you try this script against those tests. The whole script is just 150
lines long, including comments and docstrings. It is short and easy to
modify any way you like. It has successfully imported every Python
module from the standard library that I have thrown at it, producing a
perfect import each time.
I would expect that your code will end up with far more than 150
lines. The more lines of code, the more space for bugs to hide, and
the more effort needed later to read and understand the code in order
to change or fix something. The choice is yours.
import tokenize
import token
import io
from collections import defaultdict
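# Note: this runs as a Leo script; c, g and p below are Leo's
# predefined script globals.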
c.frame.log.clearTab('Log')
def mk_py_importer():
    def find_node_borders(txt):
        '''
        Returns a list of tuples (startrow, endrow, headline)
        for direct children of the node.
        '''
        inp = io.StringIO(txt)
        tokens = list(tokenize.generate_tokens(inp.readline))
        res = []
        open_definitions = defaultdict(list)
        lastindent = 0
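        # close_defs closes every definition opened at column col or
        # deeper: a DEDENT (or an outdented comment) at (row, col) means
        # that those definitions have ended.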
        def close_defs(row, col):
            for k in open_definitions:
                if k >= col:
                    for r in open_definitions[k]:
                        if r[2] is None: r[2] = row
                    del open_definitions[k][:]
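        # Scan the token stream, recording where each def/class starts
        # and closing open definitions on dedents.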
        for tok in tokens:
            row, col = tok[2]
            if tok[0] == token.INDENT:
                lastindent = col + len(tok[1])
                continue
            if (tok[0] == token.COMMENT and lastindent > col) or tok[0] == token.DEDENT:
                close_defs(row, col)
            elif tok[0] == token.NAME and tok[1] in ('def', 'class'):
                res.append([row, col, None, tok[-1].strip()])
                open_definitions[col].append(res[-1])
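        # Keep only the top-level definitions, filling the gaps between
        # them with '...some declarations' nodes.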
        nodes = [[1, 1, '']]
        for a, col, b, x in res:
            if col > 0: continue # ignore deeper definitions
            if a > nodes[-1][1]:
                nodes.append([nodes[-1][1], a, '...some declarations'])
            nodes.append([a, b, make_headline(x)])
        nodes.append([nodes[-1][1], None, ''])
        return nodes
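    # make_headline turns 'def foo(args):' into 'foo', and
    # 'class Bar(Base):' into 'Bar(Base)'.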
    def make_headline(line):
        line = line.strip()
        if line.startswith('class '):
            return line[5:].strip()[:-1]
        else:
            return line[4:].partition('(')[0].strip()
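    # rename gives a '...some declarations' node a better headline when
    # its body is a single string or nothing but comments.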
    def rename(p):
        toks = [x for x in tokenize.generate_tokens(io.StringIO(p.b).readline)
                if x[0] not in (token.NEWLINE, token.NL, token.ENDMARKER)]
        if all(x[0] == token.STRING for x in toks):
            p.h = '__doc__'
        elif all(x[0] == token.COMMENT for x in toks):
            p.h = '...comments'
    def split_root(root):
        '''
        Parses the text of the body and separates all top-level
        function and class definitions into separate nodes, which
        are all direct children of the root.

        In a second phase, this function can be called on each of
        the children with more than a certain threshold number of
        lines.
        '''
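        # The root body becomes '@others' followed by whatever comes
        # after the last top-level definition; every border in between
        # becomes a child node.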
        root.deleteAllChildren()
        txt = root.b
        lines = txt.splitlines(True)
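        # body(a, b) joins lines a..b-1 (rows are 1-based); b=None means
        # 'to the end of the text', via the `b and (b-1)` trick.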
        def body(a, b):
            return ''.join(lines[a-1:b and (b-1)])
        nodes = find_node_borders(txt)
        a, b, h = nodes[0]
        root.b = f'{body(a, b)}@others\n{body(nodes[-1][0], None)}'
        for a, b, h in nodes[1:-1]:
            child = root.insertAsLastChild()
            child.h = h
            child.b = body(a, b)
            if child.b.startswith('class ') and (b - a) > 20:
                split_class(child)
            if h == '...some declarations': rename(child)
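    # split_class dedents the class body, reuses find_node_borders on the
    # dedented text, and re-indents the '@others' line to match the class.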
    def split_class(p):
        lines = p.b.splitlines(True)
        if len(lines) < 20: return
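        # Strip the smallest indentation of the non-blank body lines, so
        # the methods look like top-level definitions to find_node_borders.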
        lws = [len(x) - len(x.lstrip()) for x in lines[1:] if x and not x.isspace()]
        ind = min(lws)
        def indent(x):
            return ' '*ind + x
        nlines = [x[ind:] if len(x) > ind else x for x in lines[1:]]
        txt = ''.join(nlines)
        nodes = find_node_borders(txt)
        a, b, h = nodes[0]
        def body(a, b):
            return ''.join(nlines[a-1:b and (b-1)])
        b1 = ''.join(lines[a:b]) + indent('@others\n')
        a, b, h = nodes.pop()
        b2 = ''.join(indent(x) for x in nlines[a-1:])
        p.b = f'{lines[0]}{b1}{b2}'
        for a, b, h in nodes[1:]:
            child = p.insertAsLastChild()
            child.h = h
            child.b = body(a, b)
            if h == '...some declarations': rename(child)
    def import_py_file(p, fname):
        with open(fname, 'r') as inp:
            p.b = inp.read()
        split_root(p)
    return split_root, import_py_file
split_root, import_py_file = mk_py_importer()
del mk_py_importer
def import_one_level(fname):
    '''
    Demonstrates the usage of the split_root function. It loads
    the given Python file into the test node and checks whether
    or not the import is perfect.
    '''
    with open(fname, 'r') as inp:
        txt = inp.read()
    root = ensure_root(p, 'py import test node')
    root.b = txt
    split_root(root)
    txt2 = g.getScript(c, root, useSentinels=False)
    if txt != txt2:
        g.es('different')
    else:
        g.es('same')
def ensure_root(p, name):
    '''
    A utility for this test script. If there is no node in the
    outline with the given name, this function adds a node after
    the current position and sets its headline to the given name.
    '''
    ps = c.find_h(name)
    if not ps:
        p1 = p.insertAfter()
        p1.h = name
        return p1
    else:
        return ps[0]
# You can choose any module here, but for
# testing purposes we're going to import and
# parse difflib.py from the standard library.
import difflib as module
import_one_level(module.__file__)
c.redraw()