On Thursday, December 2, 2021 at 10:31:56 AM UTC+1 Edward K. Ream wrote:
>
> > Idea: use tokenize python module...
>
> This idea might work, but the attached script "wishes away" the
> difficulties involved in generating code.
>
>
Attached to this message is an improved version of this script. It
doesn't "wish away the difficulties": it does a full import. Each
module-level function and class definition goes into a separate node,
and for each class node that has more than 20 lines, each method goes
into a separate child node too. All other lines that fall between
those nodes go into their own nodes with the headline
'...some declarations'. If such a body contains only a single string,
its headline is changed to `__doc__`, and if it contains only
comments, its headline is changed to '...comments'.
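For example, given a small (made-up) module like:

    '''A docstring.'''
    import os

    def spam():
        return os.getcwd()

    class Ham:
        # assume the class body runs past 20 lines...
        def eggs(self):
            pass

the script produces an outline roughly like:

    root                     (body: '@others' plus any trailing lines)
        ...some declarations (the docstring and the import)
        spam
        Ham                  (one child per method, since it exceeds 20 lines)
            eggs

Had the leading block contained only the docstring, its headline would
have been renamed to `__doc__`.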
This is pretty much how I would split any module by hand. I don't know
what your tests for the Python importer look like, but I suggest that
you try this script against those tests. The whole script is just 150
lines long, including comments and docstrings. It is short and easy to
modify any way you like. It has successfully imported every Python
module from the standard library that I have thrown at it, producing a
perfect import each time.
I would expect that your code will end up with far more than 150
lines. The more lines of code, the more space for bugs to hide, and
the more effort needed later to read and understand the code in order
to change or fix something. The choice is yours.
import tokenize
import token
import io
from collections import defaultdict
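# Note: this runs as a Leo script; c, g and p below are Leo's
# predefined script globals.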
c.frame.log.clearTab('Log')
def mk_py_importer():
    def find_node_borders(txt):
        '''
        Returns a list of tuples (startrow, endrow, headline)
        for direct children of the node.
        '''
        inp = io.StringIO(txt)
        tokens = list(tokenize.generate_tokens(inp.readline))
        res = []
        open_definitions = defaultdict(list)
        lastindent = 0
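        # close_defs closes every definition opened at column col or
        # deeper: a DEDENT (or an outdented comment) at (row, col) means
        # that those definitions have ended.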
        def close_defs(row, col):
            for k in open_definitions:
                if k >= col:
                    for r in open_definitions[k]:
                        if r[2] is None: r[2] = row
                    del open_definitions[k][:]
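        # Scan the token stream, recording where each def/class starts
        # and closing open definitions on dedents.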
        for tok in tokens:
            row, col = tok[2]
            if tok[0] == token.INDENT:
                lastindent = col + len(tok[1])
                continue
            if (tok[0] == token.COMMENT and lastindent > col) or tok[0] == token.DEDENT:
                close_defs(row, col)
            elif tok[0] == token.NAME and tok[1] in ('def', 'class'):
                res.append([row, col, None, tok[-1].strip()])
                open_definitions[col].append(res[-1])
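        # Keep only the top-level definitions, filling the gaps between
        # them with '...some declarations' nodes.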
        nodes = [[1, 1, '']]
        for a, col, b, x in res:
            if col > 0: continue # ignore deeper definitions
            if a > nodes[-1][1]:
                nodes.append([nodes[-1][1], a, '...some declarations'])
            nodes.append([a, b, make_headline(x)])
        nodes.append([nodes[-1][1], None, ''])
        return nodes
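    # make_headline turns 'def foo(args):' into 'foo', and
    # 'class Bar(Base):' into 'Bar(Base)'.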
    def make_headline(line):
        line = line.strip()
        if line.startswith('class '):
            return line[5:].strip()[:-1]
        else:
            return line[4:].partition('(')[0].strip()
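    # rename gives a '...some declarations' node a better headline when
    # its body is a single string or nothing but comments.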
    def rename(p):
        toks = [x for x in tokenize.generate_tokens(io.StringIO(p.b).readline)
                if x[0] not in (token.NEWLINE, token.NL, token.ENDMARKER)]
        if all(x[0] == token.STRING for x in toks):
            p.h = '__doc__'
        elif all(x[0] == token.COMMENT for x in toks):
            p.h = '...comments'
    def split_root(root):
        '''
        Parses the text of the body and separates all top-level
        function and class definitions into separate nodes, which
        are all direct children of the root.

        In a second phase, this function can be called on each of
        the children with more than a certain threshold number of
        lines.
        '''
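        # The root body becomes '@others' followed by whatever comes
        # after the last top-level definition; every border in between
        # becomes a child node.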
        root.deleteAllChildren()
        txt = root.b
        lines = txt.splitlines(True)
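        # body(a, b) joins lines a..b-1 (rows are 1-based); b=None means
        # 'to the end of the text', via the `b and (b-1)` trick.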
        def body(a, b):
            return ''.join(lines[a-1:b and (b-1)])
        nodes = find_node_borders(txt)
        a, b, h = nodes[0]
        root.b = f'{body(a, b)}@others\n{body(nodes[-1][0], None)}'
        for a, b, h in nodes[1:-1]:
            child = root.insertAsLastChild()
            child.h = h
            child.b = body(a, b)
            if child.b.startswith('class ') and (b - a) > 20:
                split_class(child)
            if h == '...some declarations': rename(child)
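    # split_class dedents the class body, reuses find_node_borders on the
    # dedented text, and re-indents the '@others' line to match the class.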
    def split_class(p):
        lines = p.b.splitlines(True)
        if len(lines) < 20: return
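        # Strip the smallest indentation of the non-blank body lines, so
        # the methods look like top-level definitions to find_node_borders.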
        lws = [len(x) - len(x.lstrip()) for x in lines[1:] if x and not x.isspace()]
        ind = min(lws)
        def indent(x):
            return ' '*ind + x
        nlines = [x[ind:] if len(x) > ind else x for x in lines[1:]]
        txt = ''.join(nlines)
        nodes = find_node_borders(txt)
        a, b, h = nodes[0]
        def body(a, b):
            return ''.join(nlines[a-1:b and (b-1)])
        b1 = ''.join(lines[a:b]) + indent('@others\n')
        a, b, h = nodes.pop()
        b2 = ''.join(indent(x) for x in nlines[a-1:])
        p.b = f'{lines[0]}{b1}{b2}'
        for a, b, h in nodes[1:]:
            child = p.insertAsLastChild()
            child.h = h
            child.b = body(a, b)
            if h == '...some declarations': rename(child)
    def import_py_file(p, fname):
        with open(fname, 'r') as inp:
            p.b = inp.read()
        split_root(p)
    return split_root, import_py_file
split_root, import_py_file = mk_py_importer()
del mk_py_importer
def import_one_level(fname):
    '''
    Demonstrates the usage of the split_root function. It loads
    the given Python file into the test node and checks whether
    or not the import is perfect.
    '''
    with open(fname, 'r') as inp:
        txt = inp.read()
    root = ensure_root(p, 'py import test node')
    root.b = txt
    split_root(root)
    txt2 = g.getScript(c, root, useSentinels=False)
    if txt != txt2:
        g.es('different')
    else:
        g.es('same')
def ensure_root(p, name):
    '''
    A utility for this test script. If there is no node in the
    outline with the given name, this function adds a node after
    the current position and sets its headline to the given name.
    '''
    ps = c.find_h(name)
    if not ps:
        p1 = p.insertAfter()
        p1.h = name
        return p1
    else:
        return ps[0]
# You can choose any module here, but for
# testing purposes we're going to import and
# parse difflib.py from the standard library.
import difflib as module
import_one_level(module.__file__)
c.redraw()