Alexandro,
In case this can help you, here is an example of a SAX-parsing Python script
using 'zipfile' and 'xml.sax'.
It works fine with OpenOffice and Word XML/Word docx formats.
I do not know what the benefits of using the PyUNO module would be, but I
would be pleased to learn more about this.
Laurent
#!/usr/bin/python
# -*- coding: latin-1 -*-
#-----------------------------------------------------------------------------
# @(#) <TRAMway> converter for WordML (XML and docx) and OpenOffice.org files
#-----------------------------------------------------------------------------
"""
Three usage modes are available:
1/ Command line; use options and arguments following the getopt standard
2/ CGI, install the script on a Web server
3/ Python API, call API from any other code
API:
xml2tr.convert(option_tag,argument_tab)
an option element is a couple (option name, option value)
Type -h option for option list
"""
__version__ = '$Id: xml2tr.py 40 2007-01-08 16:26:04Z saintax $'
from xml.sax import make_parser, handler, saxutils, parseString
import string
import sys
import re
import os
import md5
import getopt # command line
import cgi # web
import zipfile
def usage():
    """Write the command-line help to standard error, then exit.

    The help text is written to ``sys.stderr`` and the function finishes by
    calling ``sys.exit()`` without an argument, so it never returns to its
    caller.
    """
    # The defaults quoted below are the ones convert() registers.
    sys.stderr.write("""USAGE: %s [options] input_file...\n
options:
-i string
--identification string
\tDefines the regular expression matching the name of the paragraph style to capture Requirement Identification.
\tDefault string is '(reqidentification|id)'\n
-d string
--description string
\tDefines the regular expression matching the name of the paragraph style to capture Requirement Description.
\tDefault string is '(reqdescription|requirement)'\n
-r string
--reference string
\tDefines the regular expression matching the name of the paragraph style to capture Requirement Reference.
\tDefault string is 'reqreference'\n
-a string
--attribute string
\tDefines the regular expression matching the name of the paragraph style to capture Requirement Attribute.
\tDefault string is 'reqattribute'\n
-p string
--prefix string
\tAdd a prefix string to all detected requirement ids.
\tDefault string is empty\n
-o string
--output string
\tSpecifies an output file name.
\tDefault is the standard output.\n
Note:
\nregular expression is not case sensitive and style name can have prefix or suffix
input_file:
\tWord 2003 XML or Opendocument file format (WordML)\n""" % (sys.argv[0]))
    sys.exit()
#__________#
class param:
    """Minimal value holder exposing a single mutable ``value`` attribute.

    cmdFieldStorage.add() wraps each command-line default in one of these so
    that options can be read and overwritten through ``.value``.
    """

    def __init__(self, val):
        """Store *val* as the current value."""
        self.value = val
#____________________#
class cmdFieldStorage:
    """Dictionary-backed option store used for command-line and API runs.

    Entries live in ``self.dico``.  Single values are wrapped in ``param``
    objects (see add()); lists are stored as given (see addlist()).
    """

    def __init__(self):
        """Create an empty store."""
        self.dico = {}

    def has_key(self, key):
        """Return True when *key* has been stored."""
        # dict.has_key() no longer exists in Python 3; ``in`` works everywhere.
        return key in self.dico

    def __contains__(self, key):
        """Support ``key in storage`` as an equivalent of has_key()."""
        return key in self.dico

    def add(self, key, val):
        """Store *val* under *key*, wrapped in a ``param`` object."""
        self.dico[key] = param(val)

    def addlist(self, key, l):
        """Store the list *l* under *key* without wrapping it."""
        self.dico[key] = l

    def __getitem__(self, key):
        """Return the entry stored under *key*; raises KeyError if absent."""
        return self.dico[key]

    def getlist(self, key):
        """Return the entry stored under *key*, or an empty list if absent."""
        return self.dico.get(key, [])
#__________________#
def header_appli(sum):
    """Return the opening part of a TRAMway document.

    sum -- text inserted as the ``md5`` attribute of the root element.

    The title carries the base name of ``sys.argv[0]`` and the module's
    ``__version__`` string.
    """
    # The script name ends up in element content, so escape XML metacharacters.
    script = saxutils.escape(os.path.basename(sys.argv[0]))
    return """<?xml version="1.0" encoding="iso-8859-1" standalone="yes"?>
<?topcased-application progid="TRAMway"?>
<?xml-stylesheet href="neutral.css" type="text/css"?>
<tramway xmlns="http://www.w3.org/1999/xhtml" md5="%s">
<title>TRAMway converter [%s]
%s</title>\n""" % (sum, script, __version__)
#__________________#
def footer_appli():
    """Return the closing tag of the TRAMway root element, newline included."""
    return '</tramway>\n'
#_______________________________________#
class docHandler(handler.ContentHandler):
    """SAX handler that turns styled paragraphs into TRAMway XML.

    Paragraphs (``w:p`` or ``text:p``) are classified by matching their style
    name against the regular expressions held in the option store:
    identification ('i'), description ('d'), reference ('r') and attribute
    ('a').  Paragraphs whose style has a recorded outline level open a
    ``<section>``.  The generated XML is appended to ``fdo[0]``.
    """

    def __init__(self, dico, fdo):
        """Prepare an empty parsing state.

        dico -- option store; ``dico[key].value`` is read for the keys
                'i', 'd', 'r', 'a' (style regexes) and 'p' (id prefix).
        fdo  -- one-element list; the output text is accumulated in fdo[0].
        """
        handler.ContentHandler.__init__(self)
        self.intext = False     # True while character data must be captured
        self.buffer = ''        # raw (unescaped) text of the current paragraph
        self.style = ''         # style name of the current paragraph
        self.id = ''            # id of the requirement being collected
        self.sd = ''            # ready-made ' shortdescription="..."' fragment
        self.seen_id = {}       # ids already met, to detect duplicates
        self.reflist = ''       # '|'-separated references of the requirement
        self.desc = ''          # XML-escaped description text
        self.fo = dico
        # NOTE(review): in the archived copy of this script part of the
        # character class was replaced by the text "[EMAIL PROTECTED]" (mail
        # address obfuscation), which left a pattern that does not compile.
        # The lost characters presumably included '@'; only '@' has been put
        # back -- confirm against the original source.
        self.keywords = (r'([: #\\/\^\$,;\?\!\&@\)\(\{\}\=\+\|]'
                         r'|delete|derived|double|known|bad_id|covered'
                         r'|refined|resolved)')
        self.section = ''       # styleId of the w:style being read
        self.level = {}         # style name -> outline level (1-based)
        self.attribute = {}     # attribute name -> value of the requirement
        self.o = fdo
        self.tl = {}            # levels of the sections currently open
        self.auto = {}          # ODF automatic style 'Pnn' -> parent style

    def _latin1(self, text, errors='strict'):
        """Return *text* limited to latin-1, as a native string.

        On Python 2 the encoded byte string is returned.  On Python 3 the
        bytes are decoded again so that the result is ``str``; characters
        outside latin-1 are handled according to *errors*.
        """
        data = text.encode('latin-1', errors)
        if isinstance('', bytes):
            # Python 2: native strings are byte strings.
            return data
        return data.decode('latin-1')

    def _attr(self, attrs, name):
        """Return attribute *name* as latin-1 text, or '' when it is absent."""
        value = attrs.get(name)
        if value is None:
            return ''
        return self._latin1(value)

    def _style_matches(self, key):
        """Tell whether the current style matches the regex of option *key*."""
        return re.match(r'.*%s' % (self.fo[key].value), self.style,
                        re.IGNORECASE)

    def startDocument(self):
        """Nothing to do when the document starts."""
        pass

    def endDocument(self):
        """Flush the last requirement and close every open section."""
        self.record()
        self.newsection(1, True)

    def characters(self, str):
        """Append character data to the paragraph buffer while capturing."""
        # The text is kept raw; escaping is applied once, when it is written.
        if self.intext:
            self.buffer += self._latin1(str, 'replace')

    def startElement(self, name, attrs):
        """Track styles and paragraph starts (OpenDocument and WordML)."""
        # --- OpenDocument ---
        if name == 'style:style':
            auto_name = self._attr(attrs, 'style:name')
            if re.match(r'P\d+', auto_name):
                # Automatic paragraph style: remember the style it derives from.
                self.auto[auto_name] = self._attr(attrs,
                                                  'style:parent-style-name')
        if name == 'text:p':
            self.intext = True
            self.buffer = ''
            self.style = self._attr(attrs, 'text:style-name')
            if self.style in self.auto:
                self.style = self.auto[self.style]
        # --- WordML ---
        if name == 'w:t':
            self.intext = True
        elif name == 'w:p':
            self.style = 'none'
            self.buffer = ''
        elif name == 'w:pStyle':
            self.style = self._attr(attrs, 'w:val')
        elif name == 'w:style':
            self.section = self._attr(attrs, 'w:styleId')
        elif name == 'w:outlineLvl':
            raw_level = self._attr(attrs, 'w:val')
            if raw_level.isdigit():
                self.level[self.section] = int(raw_level) + 1

    def endElement(self, name):
        """Classify a finished paragraph according to its style."""
        if name == 'w:t':
            # End of a Word text run.  The buffer is left untouched so that
            # the whitespace separating two runs of a paragraph survives.
            self.intext = False
            return
        if name not in ('w:p', 'text:p'):
            # Inline elements (e.g. text:span) must not stop the capture of
            # the paragraph text that follows them.
            return
        self.intext = False
        self.buffer = self.buffer.strip()
        if not self.buffer:
            return
        if self._style_matches('i'):
            self._begin_requirement()
        elif self._style_matches('d'):
            self.desc += '\n' + saxutils.escape(self.buffer)
        elif self._style_matches('r'):
            if self.reflist:
                self.reflist += '|' + self.buffer
            else:
                self.reflist = self.buffer
        elif self._style_matches('a'):
            mat = re.match(r'^\s*(.+)\s*:\s*(.+)\s*$', self.buffer)
            if mat:
                # strip(): the greedy first group keeps blanks before ':'
                self.attribute[mat.group(1).strip()] = mat.group(2)
        elif self.style in self.level:
            self.newsection(self.level[self.style])

    def _begin_requirement(self):
        """Emit the pending requirement and start one from the buffer.

        A buffer of the form ``id : text`` is split into the id and a short
        description; otherwise the whole buffer is the id.  The prefix option
        'p' is prepended to the id.
        """
        self.record()  # record last requirement
        self.desc = ''
        mat = re.match(r'^\s*(.+)\s*:\s*(.+)\s*$', self.buffer)
        self.id = self.fo['p'].value
        if mat:
            # strip(): the greedy first group keeps blanks before ':'
            self.id += mat.group(1).strip()
            self.sd = ' shortdescription=%s' % (
                saxutils.quoteattr(mat.group(2)))
        else:
            self.id += self.buffer
            self.sd = ''
        if self.id in self.seen_id:
            sys.stderr.write('ERROR:DOUBLE:%s' % (self.id))
            self.id += '_DEFINED_SEVERAL_TIMES'
        else:
            self.seen_id[self.id] = True

    def newsection(self, lvl, end=False):
        """Close open sections of level >= *lvl*; open a new one unless *end*.

        The new section takes the current paragraph buffer as its
        ``shortdescription``.
        """
        # Iterate over a copy: entries are deleted while looping.
        for open_level in list(self.tl):
            if open_level >= lvl:
                self.o[0] += '</section>\n'
                del self.tl[open_level]
        if not end:
            self.tl[lvl] = True
            self.o[0] += '<section shortdescription=%s>\n' % (
                saxutils.quoteattr(self.buffer))

    def record(self):
        """Write the collected requirement to the output and reset the state.

        An id containing '#' followed by one of ``self.keywords`` is reported
        on ``sys.stderr``.  Only non-empty ids made of word characters,
        whitespace, '_' and '-' are written; any other id is dropped.
        """
        if re.match(r'.*#.*' + self.keywords + '.*', self.id, re.IGNORECASE):
            sys.stderr.write('ERROR:BAD_ID:%s' % (self.id))
        elif self.id and re.match(r'^[\s\w\d_-]*$', self.id, re.IGNORECASE):
            pieces = ['\n<requirement id=%s%s>\n' % (
                saxutils.quoteattr(self.id), self.sd)]
            pieces.append(self.desc + '\n')
            for attr_name in self.attribute:
                pieces.append('<attribute name=%s value=%s/>\n' % (
                    saxutils.quoteattr(attr_name),
                    saxutils.quoteattr(self.attribute[attr_name])))
            if self.reflist:
                for ref in self.reflist.split('|'):
                    pieces.append('<reference type="cover" id=%s/>\n' % (
                        saxutils.quoteattr(ref)))
            pieces.append('</requirement>\n')
            self.o[0] += ''.join(pieces)
        self.id = ''
        self.desc = ''
        self.reflist = ''
        self.attribute = {}
#__________________#
def get_cgi_param():
    """Return the HTML upload form shown in CGI mode.

    The form posts the input file as ``arg`` and offers one text field for
    each of the style options ``i``, ``d``, ``r`` and ``a``.
    """
    return """<form enctype="multipart/form-data" method="post">
<p> Input file <input type="file" name="arg"/></p>
<p> Option -i <input name="i" value="reqidentification"/></p>
<p> Option -d <input name="d" value="reqdescription"/></p>
<p> Option -r <input name="r" value="reqreference"/></p>
<p> Option -a <input name="a" value="reqattribute"/></p>
<input type="submit" value="send"/>
</form>"""
#____________________#
def convert(opts, str):
    """Convert an XML document to TRAMway XML and return the result as text.

    opts -- sequence of (option, value) pairs, e.g. as returned by
            getopt.getopt(); see usage() for the option list.
    str  -- XML source to convert.  When it is empty only the header and
            footer are produced.

    Side effects:
    * ``sys.stderr`` is rebound to ``sys.stdout``.
    * ``-o/--output`` rebinds ``sys.stdout`` to the named file and leaves it
      that way, so that whatever the caller prints afterwards lands in it.
    * ``-h`` and ``-v`` end the program through sys.exit().
    """
    # Local import: hashlib replaces the md5 module, which Python 3 dropped.
    import hashlib

    sys.stderr = sys.stdout
    acgi = False
    cgi_header = ''
    # CGI mode is switched off by the leading ``False``; the branch is kept
    # so it can be re-enabled.
    if False and 'HTTP_USER_AGENT' in os.environ and (opts == []):
        acgi = True
        fo = cgi.FieldStorage()
        cgi_header = 'Content-type: application/xhtml+xml'
    else:
        fo = cmdFieldStorage()
        for key, default in (('i', '(reqidentification|id)'),
                             ('d', '(reqdescription|requirement)'),
                             ('r', 'reqreference'),
                             ('a', 'reqattribute'),
                             ('p', '')):
            fo.add(key, default)

    # Options that simply overwrite one entry of the option store.
    store_key = {'-i': 'i', '--identification': 'i',
                 '-d': 'd', '--description': 'd',
                 '-r': 'r', '--reference': 'r',
                 '-a': 'a', '--attribute': 'a',
                 '-p': 'p', '--prefix': 'p'}
    for o, a in opts:
        if o in ('-h', '--help'):
            usage()
        elif o in ('-v', '--version'):
            sys.stdout.write('%s v %s\n' % (sys.argv[0], __version__))
            sys.exit()
        elif o in store_key:
            fo[store_key[o]].value = a
        elif o in ('-o', '--output'):
            # Deliberately left open: the caller prints the returned text to
            # sys.stdout after this function has returned.
            sys.stdout = open(a, 'w')

    # md5 works on bytes; text input is encoded first.
    payload = str if isinstance(str, bytes) else str.encode('utf-8')
    digest = hashlib.md5(payload).hexdigest()

    out = ''
    if acgi:
        out += cgi_header + '\n\n'
    out += header_appli(digest)
    if acgi and ('i' not in fo):
        out += get_cgi_param()
    testout = ['']
    if str:
        parseString(str, docHandler(fo, testout))
    out += testout[0]
    out += footer_appli()
    return out
#_________________________#
def convert_zip(opts, f, content):
    """Convert one member of a zip container and print the result.

    opts    -- option pairs handed to convert().
    f       -- path (or file object) of the zip archive.
    content -- name of the archive member holding the XML to convert.
    """
    archive = zipfile.ZipFile(f)
    try:
        data = archive.read(content)
    finally:
        # Close the archive even when the member cannot be read.
        archive.close()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(convert(opts, data))
#________________________#
if __name__ == '__main__':
    # Command-line entry point: parse the options, then dispatch on the kind
    # of input file (.odt, .docx, or a plain XML file identified by the
    # ``progid`` processing instruction on its second line).
    try:
        # 'prefix=' needs the '=' because -p/--prefix takes a value;
        # 'v'/'version' is accepted because convert() handles that option.
        opts, args = getopt.getopt(
            sys.argv[1:], 'hvi:d:r:a:p:o:',
            ['help', 'version', 'identification=', 'description=',
             'reference=', 'attribute=', 'prefix=', 'output='])
    except getopt.GetoptError:
        usage()
    typ = ''
    if (len(args) == 1) and os.path.isfile(args[0]):
        if args[0].endswith('.odt'):
            convert_zip(opts, args[0], 'content.xml')
        elif args[0].endswith('.docx'):
            convert_zip(opts, args[0], 'word/document.xml')
        else:
            # Read bytes so the XML parser can honour the file's own encoding.
            with open(args[0], 'rb') as source:
                data = source.read()
            lines = data.split(b'\n')
            # latin-1 decoding never fails and leaves ASCII untouched.
            second_line = lines[1].decode('latin-1') if len(lines) > 1 else ''
            mat = re.match(r'.*progid=(.*)\?>.*$', second_line)
            if mat:
                typ = mat.group(1)[1:-1]  # drop the surrounding quotes
            if typ == 'Word.Document':
                sys.stderr.write('word')
                print(convert(opts, data))
            elif typ == 'TRAMway':
                sys.stderr.write('TRAMway')
                # Already TRAMway: copy the bytes through unchanged
                # (Python 3 text streams expose the byte stream as .buffer).
                getattr(sys.stdout, 'buffer', sys.stdout).write(data + b'\n')
            else:
                sys.stderr.write('ERROR Not supported format')
    elif [o for o, _ in opts if o in ('-v', '--version')]:
        sys.stdout.write('%s v %s\n' % (sys.argv[0], __version__))
    else:
        # No usable input file: show the help instead of exiting silently.
        usage()
# Test
# example of API call without using getopt
#print xml2tr.convert([('-i','requirementId'),('-d','requirementText')],[sys.argv[1]])
Laurent
Alexandro Colorado <[EMAIL PROTECTED]>
29/03/2007 20:11
Veuillez répondre à
[email protected]
A
[email protected]
cc
Objet
[xml-dev] PyUNO Sax Module
So I am working on a script that parses ODF's XML. I have no problem
running it on Python 2.4 (my system Python). However, I can't seem to
get it to work on PyUNO (OOo's Python).
The script uses saxutils from the sax module. I made a diff of both
modules and couldn't find much difference. I wonder if anyone here has
used Python's sax module and whether they are able to run the
'DefaultHandler' class from the saxutils library.
I have googled and found other people with a similar problem. I found that
ContentHandler might be an alternative. I am also reading the
documentation, however I can't seem to find anything related to the
DefaultHandler. Maybe I am looking in the wrong place; anyway, here are
the links to those modules:
PyUNO: http://www.python.org/doc/2.3.4/lib/module-xml.sax.html
Python2.4: http://www.python.org/doc/2.4.3/lib/module-xml.sax.html
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]