Hi,
maybe you are interested in a little (let's say) XML preprocessor I've
written over the last few days. Its purpose is to ease the pain of writing
XML-style documents with a text editor.
Chuck suggested www.yaml.org, but they seem to give more than I bargained
for.
To make it short, something like
----------------------------------------
html
	head
	body
		h1 """Header"""
----------------------------------------
will be translated to
----------------------------------------
<html>
<head></head>
<body>
<h1>Header</h1>
</body>
</html>
----------------------------------------
All this stuff is not bulletproof yet, but the file that contains the
parser (px.py) has a docstring describing the format.
px2xml.py gives you a translator px -> xml
and, for convenience,
xml2px translates xml/xhtml into px notation.
I'd be happy to hear back from you about this.
-- Stephan
_________________________________________________________________________
Get Your Private, Free E-mail from MSN Hotmail at http://www.hotmail.com.
'''
px (python xml) has only one purpose: ease the pain when writing xml/html
documents with a text editor.
This is done with:
1. no "END" tags
2. using indentation for blocks
Text has to be quoted in triple '"' and can be placed directly after a tag.
A "/" as the last character in a tag name denotes an empty element.
Here is a short example:
--- px text ---
html
	head
	body(bgcolor="white") """This is some body text"""
		br/
-----------------
is equivalent to
--- html text ---
<html>
<head></head>
<body bgcolor="white">This is some body text
<br/>
</body>
</html>
-----------------
SHORTCOMINGS:
-Only '\t' can be used for indentation.
-The parser is extremely trusting that the text is valid
TODO:
-more testing
-useful error messages
-use other platforms
-use other form of WS as well for indentation
-document the code
-make interface SAX compliant
IDEAS:
-include aliases. If you had a <this-is-an-extremely-long-tag> tag, you
could say:
"alias this-is-an-extremely-long-tag t1"
(or #define or python dictionary notation?)
--------------------------------------------------------------------------
(c) Copyright by Stephan Diehl, 2000 (mailto:[EMAIL PROTECTED])
Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee or royalty is hereby
granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation or portions thereof, including modifications,
that you make.
THE AUTHORS DISCLAIM ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS, IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL,
INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
WITH THE USE OR PERFORMANCE OF THIS SOFTWARE !
'''
import sys, re
# Token kinds produced by px._tokenize() and dispatched by px.Parse().
START = 1   # element opened:  (START, name, attrs)
STOP = 2    # element closed:  (STOP, name)
TEXT = 3    # character data:  (TEXT, text)
SHORT = 4   # not referenced by the px class below
EMPTY = 5   # empty element written as "name/":  (EMPTY, name, attrs)
# Tokenizer states.
TOKEN = 11  # expecting a tag line or a one-line text
DATA = 12   # inside a multi-line triple-quoted text


class PxError(Exception):
    """Raised for malformed px input or a parser without handlers.

    Replaces the original string exceptions, which modern Python rejects.
    """


class px:
    """Event-driven parser for the tab-indented px notation.

    Assign a callable to each of the four handler attributes and then call
    Parse().  The handlers are invoked as:

        StartElementHandler(name, attrs)
        EndElementHandler(name)
        DataElementHandler(text)
        EmptyElementHandler(name, attrs)

    attrs is a dict mapping attribute names to their values exactly as
    written in the source, i.e. quoted values keep their double quotes.
    """

    # A line that holds nothing but one complete triple-quoted text.
    textline = re.compile(r'(\t*)"""(.*)"""(.*)')
    # Leading tabs, tag name, optional "(attributes)", rest of the line.
    reg = re.compile(r'(\t*)([a-zA-Z0-9:/\-]+)(\(.*\))?(.*)')
    # One name=value pair plus the space-separated remainder.  The unquoted
    # alternative is greedy up to the next space; the original lazy form
    # matched the empty string and threw the value (and all following
    # attributes) away.
    atreg = re.compile(r'(.*?)=((".*?")|([^ ]*))( (.*))?')

    def __init__(self):
        self.StartElementHandler = None
        self.EndElementHandler = None
        self.DataElementHandler = None
        self.EmptyElementHandler = None

    def Parse(self, indat):
        """Tokenize the px text *indat* and feed the tokens to the handlers.

        Raises PxError if a handler is missing or the text cannot be parsed.
        """
        required = (
            ('StartElementHandler', self.StartElementHandler),
            ('EndElementHandler', self.EndElementHandler),
            ('DataElementHandler', self.DataElementHandler),
            ('EmptyElementHandler', self.EmptyElementHandler),
        )
        for name, handler in required:
            if not handler:
                raise PxError("no %s" % name)
        for element in self._tokenize(indat):
            kind = element[0]
            if kind == START:
                self.StartElementHandler(element[1], element[2])
            elif kind == STOP:
                self.EndElementHandler(element[1])
            elif kind == TEXT:
                self.DataElementHandler(element[1])
            elif kind == EMPTY:
                self.EmptyElementHandler(element[1], element[2])

    def _parseAttr(self, string):
        """Turn 'a="x" b=y' into {'a': '"x"', 'b': 'y'}.

        Raises PxError when the remaining text contains no name=value pair.
        """
        attrs = {}
        string = string.strip()
        while string:
            m = self.atreg.match(string)
            if not m:
                raise PxError(
                    "Error while parsing attributes: %s" % string)
            attrs[m.group(1)] = m.group(2)
            rest = m.group(5)
            string = rest.strip() if rest else ''
        return attrs

    def _lexline(self, line):
        """Split a tag line into (indent depth, tag, '(attrs)' or None, rest).

        Raises PxError if the line does not start with a tag name.
        """
        m = self.reg.match(line)
        if not m:
            raise PxError("match error")
        return len(m.group(1)), m.group(2), m.group(3), m.group(4)

    def _unwind(self, level, tstack, tokenList):
        """Emit STOP tokens until at most *level* elements remain open."""
        while level < len(tstack):
            tokenList.append((STOP, tstack.pop()))

    def _tokenize(self, indat):
        """Return the list of token tuples for the px text *indat*.

        Raises PxError on an unparsable line or on a line indented deeper
        than one level below its parent.
        """
        state = TOKEN
        data = ''
        tokenList = []
        tstack = []  # names of the currently open elements
        # Iterate over every line: a final line without trailing newline
        # used to be dropped, and the empty string that split() yields
        # after a trailing newline is skipped as a blank line anyway.
        for linenum, line in enumerate(indat.split('\n'), 1):
            if state == DATA:
                parts = line.strip().split('"""')
                if len(parts) == 1:
                    data += parts[0] + '\n'
                elif len(parts) == 2:
                    tokenList.append((TEXT, data + parts[0]))
                    data = ''
                    state = TOKEN
                # A line with more than one '"""' inside a text block is
                # ignored, as in the original implementation.
                continue

            if not line.strip():
                continue

            m = self.textline.match(line)
            if m:
                # The depth is the *number* of tabs; the original compared
                # the tab string itself, so nothing was ever closed here.
                self._unwind(len(m.group(1)), tstack, tokenList)
                tokenList.append((TEXT, m.group(2)))
                continue

            try:
                numtab, element, attr, txt = self._lexline(line)
            except PxError:
                raise PxError("parse error in line %d" % linenum)
            attrs = self._parseAttr(attr[1:-1]) if attr else {}

            self._unwind(numtab, tstack, tokenList)
            if numtab > len(tstack):
                raise PxError(
                    "wrong indentation in line %d:\nstack:%s\nTokList:%s"
                    % (linenum, repr(tstack), repr(tokenList)))

            if element[-1] == '/':
                tokenList.append((EMPTY, element[:-1], attrs))
            else:
                tstack.append(element)
                tokenList.append((START, element, attrs))

            # Strip first so trailing blanks do not yield an empty TEXT.
            txt = txt.strip()
            if not txt:
                continue
            parts = txt.split('"""')
            if len(parts) == 1:
                # Unquoted trailing text is passed through as it is.
                tokenList.append((TEXT, parts[0]))
            elif len(parts) == 2:
                # Opening quotes only: the text continues on later lines.
                data += parts[1] + '\n'
                state = DATA
            elif len(parts) == 3:
                tokenList.append((TEXT, parts[1]))
            else:
                raise PxError("parse error in line %d" % linenum)

        self._unwind(0, tstack, tokenList)
        return tokenList
if __name__ == '__main__':
    # Small self-test: parse a sample document and print every event.
    #
    # Tabs are spelled as explicit escapes because the nesting of a px
    # document is defined solely by its leading tabs, and literal tabs are
    # easily lost when the file passes through mail or an editor.
    testData = (
        'html\n'
        '\thead\n'
        '\tbody(a="x" c="hallo") """first text."""\n'
        '\t\tempty1/(b="c") """some text"""\n'
        '\t\ttable\n'
        '\t\t\ttr\n'
        '\t\t\t\ttd """here comes some\n'
        '\t\t\t\t\ttext inside the td tag"""\n'
        '\t\t\t\ttd\n'
        '\t\t\ttr\n'
        '\t\t\t\ttd\n'
        '\t\t\t\ttd\n'
    )

    # print(...) with a single pre-formatted string and repr() instead of
    # backticks behave the same on Python 2 and Python 3.
    def start(el, attrs):
        print('START %s %s' % (el, repr(attrs)))

    def stop(el):
        print('STOP %s' % el)

    def data(dat):
        print('DATA %s' % dat)

    def empty(el, attrs):
        print('EMPTY %s %s' % (el, repr(attrs)))

    h = px()
    h.StartElementHandler = start
    h.EndElementHandler = stop
    h.DataElementHandler = data
    h.EmptyElementHandler = empty
    h.Parse(testData)
import px,sys
# Text shorter than this keeps the following end tag on the same line.
MAXLENGTH = 20
# Kind of the item written last, stored in parseObject.lastel.
START = 1
STOP = 2
DATA = 3
EMPTY = 4


class parseObject:
    """Writes px parser events to a stream as tab-indented XML.

    The four public methods have the signatures the px parser expects for
    its Start/End/Data/EmptyElementHandler callbacks.
    """

    def __init__(self, fout=None):
        """Write to *fout*, or to sys.stdout when it is omitted.

        The default used to be sys.stdin, which cannot be written to.
        """
        if fout is None:
            fout = sys.stdout
        self.outStream = fout
        self.tstack = []   # one entry per open element; only its length is used
        self.newline = 1   # true if the next end tag goes on a line of its own
        self.lastel = 0    # kind of the last item written (START, STOP, ...)

    def _attrText(self, attrs):
        """Format *attrs* as ' name=value' pairs.

        Values are written verbatim, so any quotes must be part of the
        value already.
        """
        if not attrs:
            return ''
        return ''.join([' %s=%s' % (k, v) for k, v in attrs.items()])

    def start(self, el, attrs):
        """Write the start tag of *el* on a new, indented line."""
        self.lastel = START
        self.outStream.write(
            '\n%s<%s%s>' % (len(self.tstack) * '\t', el, self._attrText(attrs)))
        self.tstack.append(0)

    def stop(self, el):
        """Write the end tag of *el*.

        The tag starts a new line unless it directly follows its own start
        tag or a short piece of text.
        """
        self.tstack.pop()
        if self.newline and self.lastel != START:
            self.outStream.write('\n%s' % (len(self.tstack) * '\t'))
        self.outStream.write('</%s>' % el)
        self.newline = 1
        self.lastel = STOP

    def data(self, dat):
        """Write the text *dat* unchanged (no XML escaping is applied)."""
        self.outStream.write(dat)
        if len(dat) < MAXLENGTH and self.lastel != EMPTY:
            # Short text: let the end tag follow on the same line.
            self.newline = 0
        else:
            self.outStream.write('\n')
        self.lastel = DATA

    def empty(self, el, attrs):
        """Write *el* as an empty-element tag on a new, indented line."""
        self.outStream.write(
            '\n%s<%s%s/>' % (len(self.tstack) * '\t', el, self._attrText(attrs)))
        self.newline = 1
        self.lastel = EMPTY
if __name__ == '__main__':
    # Command line front end: translate px notation into XML.
    import getopt

    xmltext = '''<?xml version="1.0" encoding="ISO_8859-1"?>
'''
    # The entity value is spelled as a character reference so that it
    # cannot be mistaken for (or converted into) an ordinary space.
    xsltext = '''<?xml version="1.0"?>
<!DOCTYPE xsl:stylesheet [
<!ENTITY nbsp "&#160;">
]>
'''
    htmltext = ''''''
    helptext = '''USAGE: px2xml [[-sxwh] [-i <infile>] [-o <outfile>] -t <text>]
Options:
-s : output has xsl stylesheet prefix
-x : output has generic xml prefix
-w : output has generic html prefix
-t <text> : between prefix and document <text> will be included
-i <file> : read data from <file>. if missing, stdin will be taken
-o <file> : write to <file>. If missing, stdout will be taken
-h : print this helpfile
'''

    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'i:o:sxwht:')
    except getopt.GetoptError as err:
        # Unknown option or missing argument: show usage, not a traceback.
        sys.stderr.write('%s\n%s' % (err, helptext))
        sys.exit(2)

    infile = None
    outfile = None
    addtext = ''
    preamble = ''
    for opt, val in optlist:
        if opt == '-i':
            infile = val
        elif opt == '-o':
            outfile = val
        elif opt == '-x':
            preamble = xmltext
        elif opt == '-s':
            preamble = xsltext
        elif opt == '-w':
            preamble = htmltext
        elif opt == '-t':
            addtext = val
        elif opt == '-h':
            print(helptext)
            sys.exit()

    # Files are opened only after all options have been read, so that -h
    # never truncates an output file, and they are closed when done.
    fi = sys.stdin
    fo = sys.stdout
    try:
        if infile is not None:
            fi = open(infile, 'r')
        if outfile is not None:
            fo = open(outfile, 'w')
        fo.write(preamble)
        fo.write(addtext)
        px2xml = parseObject(fo)
        p = px.px()
        p.StartElementHandler = px2xml.start
        p.EndElementHandler = px2xml.stop
        p.DataElementHandler = px2xml.data
        p.EmptyElementHandler = px2xml.empty
        p.Parse(fi.read())
    finally:
        if fi is not sys.stdin:
            fi.close()
        if fo is not sys.stdout:
            fo.close()
from xml.parsers import expat
import sys
# Kind of the most recent parser event, stored as self.last[0].
START = 1
STOP = 2
DATA = 3


class Parser:
    """Translates XML into px notation using expat.

    A start tag is not written when it is seen but remembered in self.last;
    only the next event shows whether the element is empty (written as
    "tag/") or has content (written as "tag").
    """

    def __init__(self, outstream=None):
        """Write px text to *outstream*, or sys.stdout when it is omitted."""
        if outstream is None:
            outstream = sys.stdout
        self.tstack = []   # tags written so far that are still open
        self.lastel = 0
        self.last = ()     # most recent event: (START, tag, attrs), (STOP, tag) or (DATA,)
        self.out = outstream
        self._parser = expat.ParserCreate()
        # Let expat join adjacent text fragments.  Without this, text such
        # as "a &amp; b" reaches data() in three pieces, each of which is
        # stripped and written as a text of its own.
        self._parser.buffer_text = True
        self._parser.StartElementHandler = self.start
        self._parser.EndElementHandler = self.end
        self._parser.CharacterDataHandler = self.data
        self._parser.ProcessingInstructionHandler = self.pi

    def feed(self, data):
        """Parse the next piece of the XML document."""
        self._parser.Parse(data, 0)

    def close(self):
        """Tell expat that the document is complete and drop the parser."""
        self._parser.Parse('', 1)
        del self._parser

    def mystart(self, tag, attrs):
        """Write *tag* (with attributes, if any) on a new, indented line."""
        indent = '\t' * len(self.tstack)
        if attrs:
            att = ''.join(
                [' %s="%s"' % (key, value) for key, value in attrs.items()])
            self.out.write('\n%s%s(%s)' % (indent, tag, att))
        else:
            self.out.write('\n%s%s' % (indent, tag))
        self.tstack.append(tag)

    def _flushStart(self):
        """Write the pending start tag, if the last event was one."""
        if self.last and self.last[0] == START:
            self.mystart(self.last[1], self.last[2])

    def start(self, tag, attrs):
        """expat callback: remember the start tag until the next event."""
        self._flushStart()
        self.last = (START, tag, attrs)

    def end(self, tag):
        """expat callback: close *tag*.

        An element that ends right after its start tag is written in the
        empty-element form "tag/".
        """
        if self.last and self.last[0] == START:
            tag, attrs = self.last[1] + '/', self.last[2]
            self.mystart(tag, attrs)
        self.last = (STOP, tag)
        self.tstack.pop()

    def data(self, data):
        """expat callback: write text as triple-quoted px text.

        Each non-blank line becomes a text of its own, because a px text
        that stands alone on a line has to be closed on that same line.
        """
        self._flushStart()
        indent = '\n' + len(self.tstack) * '\t'
        for line in data.split('\n'):
            line = line.strip()
            if line:
                self.out.write(indent + '"""' + line + '"""')
        # Recorded even for blank text so a pending start tag, which has
        # been written above, is not written a second time.
        self.last = (DATA,)

    def pi(self, target, dat):
        """expat callback: pass a processing instruction through as text."""
        self.data('<?' + target + ' ' + dat + '?>')
if __name__ == '__main__':
    # Command line front end: translate XML into px notation.
    #   -i <file>  read from <file> instead of stdin
    #   -o <file>  write to <file> instead of stdout
    import getopt

    optlist, args = getopt.getopt(sys.argv[1:], 'i:o:')
    fi = sys.stdin
    fo = sys.stdout
    try:
        for opt, val in optlist:
            if opt == '-i':
                fi = open(val, 'r')
            elif opt == '-o':
                fo = open(val, 'w')
        p = Parser(outstream=fo)
        p.feed(fi.read())
        p.close()
    finally:
        # Close what was opened here, but leave the standard streams alone.
        if fi is not sys.stdin:
            fi.close()
        if fo is not sys.stdout:
            fo.close()