Anybody interested in helping with this?
It scrapes an HTML file and converts it into a tree hierarchy of web2py
helpers:
'<div>xxx</div>' -> DIV('xxx')
It mostly works, but it fails in three cases that are described in the file.
Massimo
import re
from html import TAG
class HTMLParser(object):
    """Convert an HTML fragment into a tree of web2py ``TAG`` helpers.

    For example ``'<div>xxx</div>'`` becomes the equivalent of
    ``DIV('xxx')``.  Parsing happens in the constructor; the resulting root
    helper is available afterwards as ``self.tree``.

    Attributes found in a tag are stored on the helper under the key
    ``'_' + name``; attributes without a value are stored as ``True``.

    This is a lightweight regex scanner, not a conforming HTML parser.
    Known limitations visible in the code below:

    * anything that looks like ``<name`` is treated as a tag, including
      inside comments and ``<script>`` bodies;
    * a closing tag closes the innermost open element without checking
      that the names match.

    The parser prints a trace line for every tag it creates and for the
    trailing text (debug output retained from the original prototype).
    """

    # Start of an opening or closing tag: '<' or '</' followed by a name.
    otag = re.compile(r'</?[\w:]+', re.DOTALL)
    # One attribute: name in group 'a', optional value in group 'b'.  The
    # '=value' part is optional so that bare attributes such as
    # ``disabled`` are recognised too.
    itag = re.compile(
        r'''\s+(?P<a>[\w:]+)(\s*=\s*(?P<b>'.*?'|".*?"|\S*))?''', re.DOTALL)
    # Remainder of a tag up to and including its closing '>', skipping any
    # '>' that sits inside a quoted attribute value.
    _tag_end = re.compile(r'''(?:[^>"']|"[^"]*"|'[^']*')*>''', re.DOTALL)
    # HTML void elements: they never have content or a closing tag, so the
    # parser must not descend into them.
    _void_tags = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, text):
        """Parse *text* immediately and expose the result as ``self.tree``."""
        self.text = text
        # Kept for backward compatibility; 'html' is the only state.
        self.status = 'html'
        self.tree = self.parent = TAG['']()
        self.parse(text)

    def parse(self, text):
        """Scan *text* and append the helpers it describes to the tree.

        Text between tags is appended to the currently open element, an
        opening tag creates a new helper (and becomes the current element
        unless it is void or self-closed), and a closing tag moves back to
        the enclosing element.
        """
        pos = 0
        while True:
            match = self.otag.search(text, pos)
            close = self._tag_close(text, match.end()) if match else -1
            if close < 0:
                # No further complete tag: the remainder is plain text.
                # This also covers a '<name' that is never closed by '>'.
                rest = text[pos:]
                # Single-argument print(...) behaves identically as a
                # Python 2 statement and a Python 3 function call.
                print('text: %s' % (rest,))
                self.parent.append(rest)
                return
            self.parent.append(text[pos:match.start()])
            tagname = match.group()[1:]
            if tagname[0] != '/':
                self._open_tag(tagname, text[match.end():close])
            elif (tagname[1:].lower() not in self._void_tags
                  and self.parent is not self.tree):
                # Closing tag: step back to the enclosing element.  Stray
                # closers at the root, and closers of void elements (which
                # were never entered), are ignored so the parser cannot
                # walk off the top of the tree.
                self.parent = self.parent.parent
            pos = close + 1

    def _tag_close(self, text, pos):
        """Return the index of the '>' ending the tag at *pos*, or -1."""
        found = self._tag_end.match(text, pos)
        if found:
            return found.end() - 1
        # Unbalanced quotes inside the tag: fall back to the first '>'.
        return text.find('>', pos)

    def _open_tag(self, tagname, attr_text):
        """Create the helper for an opening tag and attach it to the tree.

        *attr_text* is everything between the tag name and the closing '>'.
        """
        tag = TAG[tagname]()
        tag.parent = self.parent
        last_end = 0
        for k in self.itag.finditer(attr_text):
            key = '_' + k.group('a')
            value = k.group('b')
            if (value and len(value) >= 2 and value[0] in ('"', "'")
                    and value[-1] == value[0]):
                # Strip the quotes only when they are a balanced pair.
                tag[key] = value[1:-1]
            elif value:
                tag[key] = value
            else:
                tag[key] = True
            last_end = k.end()
        print('tag: %s' % (tag,))
        self.parent.append(tag)
        # Only a '/' left over after the last attribute marks a self-closed
        # tag; a '/' that ends an unquoted value (href=http://x/) was
        # consumed by the attribute match above and is part of the value.
        self_closing = attr_text[last_end:].rstrip().endswith('/')
        if not self_closing and tagname.lower() not in self._void_tags:
            self.parent = tag
# Demo: parse a small fragment and print the resulting helper tree.
print(HTMLParser(
    'hello<div a="b" c=3>world<span>xxx</span>yyy</div>zzz').tree)