[web2py] parsehtml

Massimo Di Pierro Sun, 23 May 2010 08:46:16 -0700

Anybody interested in helping with this?

It scrapes an HTML file and converts it into a tree hierarchy of web2py helpers


'<div>xxx</div>' -> DIV('xxx')

It kind of works but fails at three exceptions described in the file.

Massimo

import re
from html import TAG

class HTMLParser(object):
    """Convert an HTML string into a tree of ``TAG`` helper objects.

    The text is scanned left to right.  Text found between tags is appended
    to the current node as a plain string, every opening tag becomes a
    ``TAG[name]()`` helper appended to the current node (which then becomes
    the current node), and every closing tag moves the current node back to
    its parent.  The root container is available as ``self.tree``.

    Known limitations (unchanged from the original draft):

    * ``<!-- comments -->`` and ``<!DOCTYPE ...>`` are not recognised as
      tags and end up in the tree as plain text;
    * the content of ``<script>``/``<style>`` is scanned for tags like any
      other text;
    * a ``>`` inside a quoted attribute value terminates the tag early;
    * a closing tag is not checked against the name of the element it
      closes; it simply moves one level up.
    """

    # Start of a tag: '<' or '</' immediately followed by the tag name.
    otag = re.compile(r'</?[\w:]+', re.DOTALL)
    # One attribute: a name optionally followed by '=' and a single-quoted,
    # double-quoted or bare value.  The '=value' part is optional so that
    # valueless attributes such as ``disabled`` are matched too (group 'b'
    # is then None).
    itag = re.compile(
        r'\s+(?P<a>[\w:]+)(\s*=\s*(?P<b>\'.*?\'|".*?"|\S*))?', re.DOTALL)
    # HTML void elements: they never have content or a closing tag, so the
    # parser must not descend into them.
    _void_tags = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr'])

    def __init__(self, text):
        """Parse ``text`` immediately; the result is stored in ``self.tree``."""
        self.text = text
        self.status = 'html'
        # ``parent`` is the node currently being filled; it starts at the root.
        self.tree = self.parent = TAG['']()
        self.parse(text)

    def parse(self, text):
        """Consume ``text`` and append the parsed nodes to the current node.

        Returns ``None``; the tree is built through ``self.parent`` /
        ``self.tree``.  Progress is echoed with ``print`` exactly as the
        original draft did.
        """
        while True:
            match = self.otag.search(text)
            if not match:
                # No more tags: whatever is left is trailing text.
                print('text: %s' % (text,))
                if text:
                    self.parent.append(text)
                return
            start, stop = match.span()
            leading = text[:start]
            if leading:
                # Skip empty strings so adjacent tags do not leave '' nodes.
                self.parent.append(leading)
            tagname = match.group()[1:]
            close = text.find('>', stop)
            if close == -1:
                # Unterminated tag: no attribute text, and the remainder is
                # handled as ordinary text on the next iteration.
                inner, rest = '', text[stop:]
            else:
                inner, rest = text[stop:close], text[close + 1:]
            if tagname[0] == '/':
                self._close_tag()
            else:
                self._open_tag(tagname, inner)
            text = rest

    def _open_tag(self, tagname, inner):
        """Create the helper for an opening tag and attach it to the tree."""
        selfclosing, inner = self._split_selfclosing(inner)
        childless = selfclosing or tagname.lower() in self._void_tags
        if childless:
            # NOTE(review): relies on the web2py convention that a TAG name
            # ending in '/' produces a self-closing helper -- confirm
            # against the installed web2py version.
            tag = TAG[tagname + '/']()
        else:
            tag = TAG[tagname]()
        tag.parent = self.parent
        for name, value in self._attributes(inner):
            tag['_' + name] = value
        print('tag: %s' % (tag,))
        self.parent.append(tag)
        if not childless:
            # Only elements that can hold content become the current node.
            self.parent = tag

    def _close_tag(self):
        """Move the current node one level up, never above the root."""
        # The root never gets a ``parent`` assigned by this class, so a stray
        # closing tag at top level is ignored instead of raising.
        parent = getattr(self.parent, 'parent', None)
        if parent is not None:
            self.parent = parent

    def _attributes(self, inner):
        """Return ``[(name, value), ...]`` parsed from a tag's attribute text.

        Quoted values lose their surrounding quotes; attributes with no
        value or an empty bare value map to ``True``.
        """
        attributes = []
        for found in self.itag.finditer(inner):
            value = found.group('b')
            if value and value[0] in ('"', "'"):
                value = value[1:-1]
            elif not value:
                value = True
            attributes.append((found.group('a'), value))
        return attributes

    @staticmethod
    def _split_selfclosing(inner):
        """Return ``(is_selfclosing, attribute_text)`` for a tag's inner text.

        A trailing '/' marks a self-closing tag only when it stands alone or
        follows whitespace or a closing quote, so a bare value such as
        ``href=/path/`` keeps its final slash.
        """
        stripped = inner.rstrip()
        if stripped.endswith('/'):
            before = stripped[:-1]
            if not before or before[-1].isspace() or before[-1] in '"\'':
                return True, before
        return False, inner
                

# Demo: parse a small fragment and print the resulting helper tree.
sample_html = 'hello<div a="b" c=3>world<span>xxx</span>yyy</div>zzz'
print(HTMLParser(sample_html).tree)

[web2py] parsehtml

Reply via email to