I'm trying to write an example of how to move elements around in a document using ElementTree. The objective is to take things like this:

    <em><h1>Heading</h1></em>

and turn them into:

    <h1><em>Heading</em></h1>

i.e., put the emphasis inside the h1-h4 elements, instead of the other way around.

It's almost working, but I'm still having trouble handling nodes whose children are interspersed text and elements.

The script below shows the problem. Run it from the command line with
no argument, and it'll break on the 'Single Inversion' text. Run it again with 'movetext' as its only argument, and it'll break on the last
test (in which the body element has strings, em+heading, and other elements as children).


I think the root of my problem is that I don't understand how ElementTree stores text --- if you have:

    <p> a <b>c</b> d <e> f <g/></e> </p>

then what are p's children? What is p.text? What happens if you assign a new value to p.text?

Thanks,
Greg

(Note: if your news reader breaks the 'Mixed Content' and 'Nested' test cases across lines, you may have to edit them.)

import sys
from cElementTree import Element, fromstring, tostring

# from visitor import Visitor
class Visitor(object):

    def __init__(self):
        pass

    def visit(self, root):
        self.beforeAll(root)
        self.traverse(root)
        self.afterAll(root)

    def traverse(self, current):
        self.beforeNode(current)
        self.atNode(current)
        for child in current:
            self.traverse(child)
        self.afterNode(current)

    def doNothing(self, node):
        pass

    beforeAll = doNothing
    afterAll = doNothing
    beforeNode = doNothing
    afterNode = doNothing
    atNode = doNothing

HeadingTags = ('h1', 'h2', 'h3', 'h4')

def containsOnlyHeading(node):
    '''Does a node contain only a single heading?'''
    return (len(node) == 1) and \
           (node[0].tag in HeadingTags)

class Finder(Visitor):
    '''Locate all nodes in a tree that have emphasized nodes containing
    a single heading as children.'''

    def beforeAll(self, root):
        self.nodes = []

    def atNode(self, node):
        for child in node:
            if (child.tag == 'em') and containsOnlyHeading(child):
                self.nodes.append(node)
                return

def transform(parent):
'''Transform a node that has emphasized children containing headings.'''


    print '..parent', tostring(parent)

    # Helper function to locate a child in a parent.
    def findIndex(parent, child):
        for i in range(len(parent)):
            if parent[i] is child:
                return i
        return -1

    # Get all emphasized nodes, and filter to get the ones to be modified.
    allEmph = parent.findall('em')
    allEmph = [x for x in allEmph if containsOnlyHeading(x)]
    assert allEmph

    # Transform each in turn.
    for emph in allEmph:

        print '....emph', tostring(emph)

        # Get the heading.
        assert len(emph) == 1
        heading = emph[0]
        assert heading.tag in HeadingTags

        print '....heading', tostring(heading)

        # Take the heading out of the emphasized node.
        emph.remove(heading)

        print '....after removing heading, emph is', tostring(emph)

        # Put the heading in the parent in the emphasized node's place.
        loc = findIndex(parent, emph)
        assert loc >= 0
        parent[loc] = heading

print '....after putting heading in emph place, parent is', tostring(parent)

# Move the heading's children and text to the emphasized node.
if 'movetext' in sys.argv[1:]:
emph.text = heading.text
heading.text = None
print '....after moving text, heading is', tostring(heading), 'and emph is', tostring(emph)
else:
print '....not moving text'
while len(heading):
child = heading[0]
emph.append(child)
heading.remove(child)
print '......after moving', tostring(child), 'emph is', tostring(emph), 'and heading is', tostring(heading)


# Make the emphasized node the heading's only child.
heading.append(emph)
print 'after attaching emph to heading, heading is', tostring(heading)


def normalize(root):
    '''Normalize an entire document.'''
    f = Finder()
    f.visit(root)
    for node in f.nodes:
        transform(node)

if __name__ == '__main__':

    tests = (
        ('Empty',
         '<empty />',
         '<empty />'),

        ('Single',
         '<single><child /></single>',
         '<single><child /></single>'),

        ('Em Only',
         '<html><em>unchanged</em></html>',
         '<html><em>unchanged</em></html>'),

        ('H1 Only',
         '<html><h1>unchanged</h1></html>',
         '<html><h1>unchanged</h1></html>'),

        ('Already Normalized',
         '<html><h1><em>unchanged</em></h1></html>',
         '<html><h1><em>unchanged</em></h1></html>'),

        ('Single Inversion',
         '<html><em><h1>changed</h1></em></html>',
         '<html><h1><em>changed</em></h1></html>'),

('Mixed Content',
'<html><em><h1><b>change</b> this <b>and</b> that</h1></em></html>',
'<html><h1><em><b>change</b> this <b>and</b> that</em></h1></html>'),


('Nested',
'<html><body><em><h2>x</h2></em> <p>space</p> <em><h3>y</h3></em> space</body></html>',
'<html><body><h2><em>x</em></h2> <p>space</p> <h3><em>y</em></h3> space</body></html>')
)


    for (name, input, expected) in tests:
        print name
        print 'INPUT', input
        doc = fromstring(input)
        normalize(doc)
        actual = tostring(doc)
        print 'EXPECTED', expected
        print 'ACTUAL', actual
        print
        assert actual == expected

_______________________________________________
XML-SIG maillist  -  XML-SIG@python.org
http://mail.python.org/mailman/listinfo/xml-sig

Reply via email to