# See attachment
#!/usr/bin/env python
"""
Gleaning Resource Descriptions from Dialects of Languages (GRDDL)
That is, GRDDL provides a relatively inexpensive mechanism for bootstrapping RDF content from
uniform XML dialects; shifting the burden from formulating RDF to creating transformation
algorithms specifically for each dialect. XML Transformation languages such as XSLT are
quite versatile in their ability to process, manipulate, and generate XML. The use of XSLT
to generate XHTML from single-purpose XML vocabularies is historically celebrated as a powerful
idiom for separating structured content from presentation.
GRDDL shifts this idiom to a different end: separating structured content from its authoritative
meaning (or semantics). GRDDL works by associating transformations for an individual document,
either through direct inclusion of references or indirectly through profile and namespace documents.
See:
- http://4suite.org/docs/CoreManual.xml#xpath_query
- http://4suite.org/docs/CoreManual.xml#xslt_engine
- http://4suite.org/docs/CoreManual.xml#id1219140460 (for http://www.w3.org/2004/01/rdxh/spec#issue-base-param)
Copyright (c) 2006, Chimezie Ogbuji
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of inamidst.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import time, sys, urllib2, re, getopt
from sets import Set
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from pprint import pprint
from rdflib import Variable, BNode, URIRef, Literal, Namespace, RDF, RDFS
from rdflib.Collection import Collection
from rdflib.Graph import ConjunctiveGraph, QuotedGraph, ReadOnlyGraphAggregate, Graph
from rdflib.syntax.NamespaceManager import NamespaceManager
import Ft.Xml.Domlette
from Ft.Xml import Parse
from Ft.Xml.Xslt import Processor
from Ft.Xml import InputSource
from Ft.Lib.Uri import Absolutize
#Namespace of the GRDDL vocabulary (used to build property URIs such as
#GRDDL_NS.namespaceTransformation and GRDDL_NS.profileTransformation)
GRDDL_NS = Namespace("http://www.w3.org/2003/g/data-view#")
#URI of the GRDDL profile itself (the namespace above without the trailing '#')
GRDDL_URI = u'http://www.w3.org/2003/g/data-view'
XHTML_NS = u"http://www.w3.org/1999/xhtml"
#Built-in list of namespace uri's that should terminate any recursive namespace dispatch
NSDispatchTermination = [XHTML_NS]
#Flag to determine whether or not to only attempt to parse a URL as XML if it's mime-type
#is appropriate
CHECK_XML_MIMETYPE = False
#Verbosity flag consulted throughout the module. main() rebinds it from the --debug
#option; it is given a default here so the classes can be imported (or doctested)
#without main() having run first, which previously raised a NameError.
DEBUG = False
class Glean(object):
"""
Handles all the GRDDL XML parsing and XSLT transformation from URLs
"""
def __init__(self, url, graph, preParsedDOM=None):
"""
>>> g = Glean(u'http://www.w3.org/2003/g/po-doc.xml',Graph())
>>> g.dom.documentElement.localName
u'purchaseOrder'
"""
self.graph = graph
self.url = url
if preParsedDOM:
self.dom = preParsedDOM
else:
req = urllib2.Request(url)
try:
#peek in response headers to determine content-type
u = urllib2.urlopen(req)
self.headers = u.info()
if re.match(r'(?:text|application)/.*\+?xml', self.headers['content-type']) is None and CHECK_XML_MIMETYPE:
#What does the spec mandate about this scenario?
if DEBUG:
print "Ignoring non-xml information resource", url
print self.headers['content-type'].split(';')[0]
self.dom = None
else:
rt = u.read()
self.dom = Ft.Xml.Domlette.NonvalidatingReader.parseString(rt, url)
except Exception, e:
if DEBUG:
print "Unable to parse ", url, repr(e)
#Unable to glean. Fail gracefully..
self.dom = None
self.appliedTransforms = []
def transform(self, transformURLs):
"""
Takes a space seperated list of transform url's and applies them against thhe
pre-parsed DOM of the GRDDL source - making sure to avoid transformation already applied
"""
for xformURL in transformURLs.split():
if DEBUG:
print "applying transformation %s" % (xformURL)
if xformURL not in self.appliedTransforms:
self.appliedTransforms.append(xformURL)
#The transform url is resolved against the source URL (to accomodate relative urls)
transform = InputSource.DefaultFactory.fromUri(Absolutize(xformURL, self.url))
processor = Processor.Processor()
processor.appendStylesheet(transform)
#see: http://www.w3.org/TR/grddl/#stylepi
result = processor.runNode(self.dom, self.url, ignorePis=1)
#get output method / media-type
# <!-- Category: top-level-element -->
# <xsl:output
# method = "xml" | "html" | "text" | qname-but-not-ncname
# version = nmtoken
# encoding = string
# omit-xml-declaration = "yes" | "no"
# standalone = "yes" | "no"
# doctype-public = string
# doctype-system = string
# cdata-section-elements = qnames
# indent = "yes" | "no"
# media-type = string />
#How to accomodate @media-type?
method = processor.outputParams.method[-1]
currLen = len(self.graph)
if method == 'xml':
self.graph.parse(StringIO(result), publicID=self.url)
if DEBUG:
print "Parsed %s triples as RDF/XML"%(max(0,len(self.graph) - currLen))
elif method == 'text':
#Attempt a Notation 3 parse (covers NTriples, and Turtle)
self.graph.parse(StringIO(result), format='n3', publicID=self.url)
if DEBUG:
print "Parsed %s triples as Notation 3"%(max(0,len(self.graph) - currLen))
else:
#HTML result - recursive GRDDL mechanism?
raise Exception("unsupported output type")
class GRDDLAgent:
"""
The main entry point for the GRDDL agent
Takes a url and a graph to store the GRDDL result and attempts to 'glean'
in the 4 major ways that GRDDL mandates
"""
def __init__(self, url, graph):
if DEBUG:
print "Attempting a comprehensive glean of ", url
parsedSource = None
for gleanMethod in [XMLGlean, XMLNSGlean, XHTMLProfileGlean, ValidXHTMLGlean]:
#Don't reparse the GRDDL source
if not parsedSource:
gleaned = gleanMethod(url, graph)
parsedSource = gleaned.dom
elif parsedSource:
gleanMethod(url, graph, preParsedDOM=parsedSource)
class XMLGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#grddl-xml - Adding GRDDL to well-formed XML
    The general form of associating a GRDDL transformation link with a well-formed
    XML document is by adorning the root element with a grddl namespace declaration
    and a grddl:transformation attribute whose value is a URI reference, or list
    of URI references, that refer to executable scripts or programs which are
    expected to transform the source document into RDF.
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Apply the transformations named by the root element's
        data-view:transformation attribute (if any) to the source document.

        >>> g = XMLGlean(u'http://www.w3.org/2003/g/po-ex', Graph())
        >>> g.appliedTransforms[0]
        u'http://www.w3.org/2003/g/embeddedRDF.xsl'
        >>> pprint(list(g.graph))
        [(u'http://www.w3.org/2003/g/po-ex',
        u'http://www.w3.org/2003/g/data-view#namespaceTransformation',
        u'http://www.w3.org/2003/g/grokPO.xsl')]
        """
        #Forward preParsedDOM: dropping it forced a fresh download and parse of the
        #source even when the caller had already parsed it
        super(XMLGlean, self).__init__(url, graph, preParsedDOM)
        #always defined, so the attribute can be inspected when nothing was parsed
        self.xforms = []
        if self.dom:
            self.xforms = [attr.value for attr in self.dom.xpath(u'/*/@data-view:transformation', {u'data-view':u'http://www.w3.org/2003/g/data-view#'})]
            if self.xforms:
                #the attribute value is itself a space separated list of urls,
                #which transform() splits
                self.transform(self.xforms[0])
class XMLNSGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#ns-bind - Using GRDDL with XML Namespace Documents
    Any resource available for retrieval from a namespace URI is a namespace document
    (cf. section 4.5.4. Namespace documents in [WEBARCH]). For example, a namespace
    document may have an XML Schema representation or an RDF Schema representation,
    or perhaps both
    To associate a GRDDL transformation with a whole dialect, have the namespace document
    include the grddl:namespaceTransformation property.
    * if an information resource ?D has an XML representation whose root element has a
    namespace name ?NS then any GRDDL result of the resource identified by ?NS is a GRDDL
    result of ?D
    * if an information resource ?D has an XML representation whose root element has a
    namespace name ?NSDOC** and ?D has a GRDDL result that includes, for any ?TX, the RDF
    triple { ?NSDOC <http://www.w3.org/2003/g/data-view#namespaceTransformation> ?TX } then
    ?TX is also a transformation of ?D
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Glean the namespace document of the source's root element, then apply every
        namespaceTransformation asserted for that namespace to the source itself.
        self.nsURI holds the root element's namespace (None if nothing was parsed).

        >>> g = XMLNSGlean(u'http://www.w3.org/2003/g/po-doc.xml', Graph())
        >>> g.nsURI
        u'http://www.w3.org/2003/g/po-ex'
        >>> len(g.graph)
        16
        """
        #Forward preParsedDOM so an already parsed source is not fetched again
        super(XMLNSGlean, self).__init__(url, graph, preParsedDOM)
        self.nsURI = None
        if not self.dom:
            return
        self.nsURI = self.dom.xpath(u'/*')[0].namespaceURI
        if not self.nsURI or self.nsURI in NSDispatchTermination:
            return
        #glean GRDDL result from the namespace document - unless the source *is* its
        #own namespace document (e.g. the RDF namespace), in which case dispatching
        #again would recurse without end
        if self.nsURI != url:
            GRDDLAgent(self.nsURI, self.graph)
        #setup a set of processed transforms to avoid infinite namespace snooping cycles
        processedNSXForms = Set()
        #Repeatedly look for 'new' namespace transformations: applying one may add
        #further namespaceTransformation statements to the graph
        while True:
            todoXForms = Set()
            for s, p, xform in self.graph.triples((self.nsURI, GRDDL_NS.namespaceTransformation, None)):
                if xform not in processedNSXForms:
                    todoXForms.add(xform)
            #continue only if we have xforms to apply
            if not todoXForms:
                break
            #apply the new namespace transforms on the GRDDL source, merging the GRDDL results as we go
            for newXForm in todoXForms:
                self.transform(newXForm)
                processedNSXForms.add(newXForm)
class ValidXHTMLGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#grddl-xhtml - Using GRDDL with valid XHTML
    The general form of adding a GRDDL assertion to a valid XHTML document
    is by specifying the GRDDL profile in the profile attribute of the head
    element, and transformation as the value of the rel attribute of a link
    or a element whose href attribute value is a URI reference that refers
    to an executable script or program which is expected to transform the
    source document into RDF. This method is suitable for use with valid
    XHTML documents which are constrained by an XML DTD.
    Stated more formally:
    * An XHTML document whose metadata profiles include
    http://www.w3.org/2003/g/data-view has a GRDDL transformation for each
    resource identified by a link of type transformation.
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Apply the transformation referenced by every XHTML <a> / <link> element with
        rel="transformation", provided the document's head profile mentions the
        GRDDL profile URI.
        """
        #Forward preParsedDOM so an already parsed source is not fetched again
        super(ValidXHTMLGlean, self).__init__(url, graph, preParsedDOM)
        if self.dom:
            xhtmlNSMap = {u'xhtml':XHTML_NS}
            for xform in self.dom.xpath(u'/xhtml:html[xhtml:head[contains(@profile, "%s")]]//xhtml:*[(local-name() = "a" or local-name() = "link") and @rel="transformation"]/@href'%GRDDL_URI, xhtmlNSMap):
                self.transform(xform.value)
class XHTMLProfileGlean(Glean):
"""
http://www.w3.org/TR/grddl/#profile-bind - GRDDL for HTML Profiles
A more formal description on the relation between GRDDL and XHTML profiles follows:
* if an information resource ?D has an XHTML representation whose profile attribute
refers to ?PROFILE, then any GRDDL result of ?PROFILE is a GRDDL result of ?D
* if an information resource ?D has an XHTML representation whose profile attribute
refers to ?PROFILE and ?D has a GRDDL result that includes, for any ?TX, the RDF triple
{ ?PROFILE <http://www.w3.org/2003/g/data-view#profileTransformation> ?TX } then ?TX
is also a GRDDL transformation of ?D
"""
def __init__(self, url, graph, preParsedDOM=None):
"""
>>> g = XHTMLProfileGlean(u'http://www.w3.org/2003/g/data-view', Graph())
>>> g.profiles
>>> len(g.graph)
"""
super(XHTMLProfileGlean, self).__init__(url, graph)
self.profiles = []
if self.dom:
profile = self.dom.xpath(u'/xhtml:html/xhtml:head/@profile', {u'xhtml':XHTML_NS})
if profile:
self.profiles = profile[0].value.split()
for profile in self.profiles:
if profile == GRDDL_URI:
continue
if DEBUG:
print "processing profile url: ", profile
#glean GRDDL result from the profile document
GRDDLAgent(profile, self.graph)
continueRecursion = True
#setup a set of processed transforms to avoid infinite profile snooping cycles
processedProfileXForms = Set()
#Recursively find 'new' namespace transformations
while continueRecursion:
todoXForms = Set()
for s, p, xform in self.graph.triples((profile, GRDDL_NS.profileTransformation, None)):
if xform not in processedProfileXForms:
todoXForms.add(xform)
#continue only if we have xforms to apply
continueRecursion = bool(todoXForms)
#apply the new namespace transforms on the GRDDL source, merging the GRDDL results as we go
for newXForm in todoXForms:
self.transform(newXForm)
processedProfileXForms.add(newXForm)
#Namespace of the OWL vocabulary
OWL_NS = Namespace("http://www.w3.org/2002/07/owl#")
#Alias for None - presumably meant as a wildcard in triple patterns; verify against callers
Any = None
#Namespace of the SWAP log vocabulary
LOG = Namespace("http://www.w3.org/2000/10/swap/log#")
def usage():
print "GRDDL.py [--test] [--help] [--debug] [--output-format=<'n3' or 'ntriples' or 'xml'>] [--ns=prefix=namespaceUri]* srcURL"
def main(argv):
global DEBUG
if argv is None: argv = sys.argv
try:
opts, args = getopt.getopt(argv[1:], "",
["help", "output-format=","debug","ns="])
except getopt.GetoptError,e:
print e
usage()
return 2
output = 'xml'
DEBUG=False
docTesting = False
nsBinds = {
'rdf' : RDF.RDFNS,
'rdfs': RDFS.RDFSNS,
'owl' : "http://www.w3.org/2002/07/owl#"
}
for o, a in opts:
if o == "--test":
docTesting = True
elif o == "--ns=":
pref,nsUri = a.split('=')
nsBinds[pref]=nsUri
elif o == "--debug":
DEBUG = True
elif o == "--help":
usage()
return 0
elif o == "--output-format":
output = a
if len(argv) < 2:
usage()
return 2
elif docTesting:
test()
return 2
graph = Graph()
namespace_manager = NamespaceManager(Graph())
for prefix,uri in nsBinds.items():
namespace_manager.bind(prefix, uri, override=False)
graph.namespace_manager = namespace_manager
g=GRDDLAgent(argv[-1], graph)
print graph.serialize(format=output)
def test():
    """Run the doctests via doctest.testmod() with its default arguments."""
    from doctest import testmod
    testmod()
#Script entry point: run the GRDDL client when this file is executed directly.
#NOTE(review): the status code returned by main() is discarded here, so the
#process exit status does not reflect usage errors - confirm whether
#sys.exit(main(sys.argv)) was intended.
if __name__ == '__main__':
    main(sys.argv)
#!/usr/bin/env python
"""
Test harness for GRDDL test suite. Uses RDFLib to process test manifest and perform graph
isomorphism testing against expected output. Uses Sean B. Palmer's RDF Graph Isomorphism Tester
Copyright (c) 2006, Sean B. Palmer, Chimezie Ogbuji
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of inamidst.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
[ a e:Assertion; e:assertedBy [ foaf:homepage <http://www.w3.org/People/Connolly> ]; e:testSubject
<glean.py>; e:test [ = <grddl-wg/td/testlist1#sq1a>; a e:TestCase ]; e:testResult [ a e:TestResult; e:validity
e:pass ] ] .
"""
from pprint import pprint
from sets import Set
from rdflib.Namespace import Namespace
from rdflib import plugin,RDF,RDFS,URIRef,URIRef,Literal,Variable,BNode
from rdflib.store import Store
from cStringIO import StringIO
from rdflib.Graph import Graph,ReadOnlyGraphAggregate,ConjunctiveGraph
import sys, getopt
import os, tempfile
import urllib2
#Dublin Core elements namespace; bound to the 'dc' prefix for the manifest query in process()
DC_NS = Namespace('http://purl.org/dc/elements/1.1/')
def compare(p, q):
    """
    Return True when Graph(p) and Graph(q) hash to the same value.
    NOTE(review): what Graph() does with p / q depends on the rdflib Graph
    constructor - confirm the intended argument type.
    """
    leftHash = hash(Graph(p))
    rightHash = hash(Graph(q))
    return leftHash == rightHash
class IsomorphicTestableGraph(Graph):
    """
    Ported from http://www.w3.org/2001/sw/DataAccess/proto-tests/tools/rdfdiff.py
    (Sean B Palmer's RDF Graph Isomorphism Tester)

    Two instances compare equal when they hold the same triples once every
    blank node is replaced by a signature derived from the triples it occurs in.
    """
    def __init__(self, **kargs):
        super(IsomorphicTestableGraph,self).__init__(**kargs)
        self.hash = None

    def internal_hash(self):
        """
        This is defined instead of __hash__ to avoid a circular recursion scenario with the Memory
        store for rdflib which requires a hash lookup in order to return a generator of triples
        """
        tripleHashes = list(self.hashtriples())
        tripleHashes.sort()
        return hash(tuple(tripleHashes))

    def hashtriples(self):
        """Yield one hash per triple, with blank nodes swapped for their vhash signature."""
        for triple in self:
            signature = []
            for node in triple:
                if isinstance(node, BNode):
                    #fall back to the node itself should its signature be empty
                    signature.append(self.vhash(node) or node)
                else:
                    signature.append(node)
            yield hash(tuple(signature))

    def vhash(self, term, done=False):
        """Return the sorted tuple of signatures of every triple mentioning term."""
        signatures = list(self.vhashtriples(term, done))
        signatures.sort()
        return tuple(signatures)

    def vhashtriples(self, term, done):
        """Yield a signature tuple for each triple of the graph that contains term."""
        for candidate in self:
            if term in candidate:
                yield tuple(self.vhashtriple(candidate, term, done))

    def vhashtriple(self, triple, term, done):
        """
        Yield the three components of triple's signature: non blank nodes as
        themselves, term (or any blank node once done is set) as its position in
        the triple, and any other blank node as its own one-level-deep vhash.
        """
        for position in (0, 1, 2):
            node = triple[position]
            if not isinstance(node, BNode):
                yield node
            elif done or (node == term):
                yield position
            else:
                #done=True stops the expansion from descending any further
                yield self.vhash(node, done=True)

    def __eq__(self, G):
        """Graph isomorphism testing."""
        if not isinstance(G, IsomorphicTestableGraph):
            return False
        if len(self) != len(G):
            return False
        #cheap check first: identical triples in identical iteration order
        if list(self) == list(G):
            return True # @@
        return self.internal_hash() == G.internal_hash()

    def __ne__(self, G):
        """Negative graph isomorphism testing."""
        return not self.__eq__(G)
def runProcessor(processor,inputUri):
if DEBUG:
print "running: ", processor + " " + inputUri
return os.popen(processor + " " + inputUri,"r")
def updateTest(processor,inputUri,outputUri):
outputfilename = outputUri[outputUri.rfind('/')+1:]
print "Updating ",outputfilename
outputfile = open(outputfilename,"w")
output = runProcessor(processor,inputUri)
outputfile.write(output.read())
outputfile.close()
# Returns false when applying processor on inputUri differs from outputUri
def runTest(processor,inputUri,outputUri):
output = runProcessor(processor,inputUri)
outputfilename = outputUri[outputUri.rfind('/')+1:] + ".result"
if DEBUG:
print "Saving result to ",outputfilename
outputfile = open(outputfilename,"w")
outputfile.write(output.read())
outputfile.close()
expected = IsomorphicTestableGraph().parse(outputUri)
try:
actual = IsomorphicTestableGraph().parse(outputfilename)
except:
if DEBUG:
print "problems parsing result"
return False
if len(actual) != len(expected):
if DEBUG:
print "Unequal lengths: expected = %s actual = %s"%(len(expected),len(actual))
rt = actual == expected
if DEBUG and actual != expected:
print "Missing: "
pprint(list(Set(list(expected)).difference(list(actual))))
return rt
def process(action,uri,processor):
data = Graph()
data.parse(uri)
nsMapping = {
u'test':Namespace('http://www.w3.org/2000/10/rdf-tests/rdfcore/testSchema#'),
u'dc':DC_NS
}
hasFailure = 0
for descr,test,input,output in data.query("SELECT ?desc ?t ?i ?o WHERE { ?t test:inputDocument ?i. ?t a test:Test . ?t dc:title ?desc. ?t test:outputDocument ?o }",initNs=nsMapping):
if DEBUG:
print "###", descr, "###"
print "\t",input
if action=="update":
updateTest(processor,input,output)
elif action=="run":
if not runTest(processor,input,output):
print "* %s failed" % test
hasFailure = 1
if not hasFailure and action=="run":
print "All tests were passed!"
def main(argv=None):
    """
    Command-line entry point of the test harness.

    argv is the full argument vector (program name first); None means sys.argv.
    Options: -d/--debug, -h/--help, -r/--run <processor>, -u/--update <processor>,
    followed by exactly one positional argument, the URI of the list of tests.
    Returns 0 on success or after --help, 2 on a usage error.
    """
    global DEBUG
    if argv is None:
        argv = sys.argv
    try:
        opts, args = getopt.getopt(argv[1:], "dhr:u:",
                                   ["help", "run=", "update=","debug"])
    except getopt.GetoptError:
        usage()
        return 2
    DEBUG = 0
    processor = action = None
    for flag, value in opts:
        if flag in ("-d", "--debug"):
            DEBUG = 1
        elif flag in ("-h", "--help"):
            usage()
            return 0
        elif flag in ("-r", "--run"):
            processor, action = value, "run"
        elif flag in ("-u", "--update"):
            processor, action = value, "update"
    #a processor, an action and exactly one manifest URI are all mandatory
    if not processor or not action or len(args) != 1:
        usage()
        return 2
    # args[0] is the URI of the list of tests
    process(action, args[0], processor)
    return 0
def usage():
    """
    Print the module docstring, followed by the module's __version__ when one is
    defined.  __version__ is looked up defensively: no such name is assigned in
    this file, so referencing it directly raised a NameError on every call
    (including for --help).
    """
    #single-argument print written with parentheses: same output as the bare
    #print statement
    print(__doc__)
    version = globals().get('__version__')
    if version is not None:
        print(version)
#Script entry point: run the test harness and use main()'s return value as the
#process exit status.
if __name__ == '__main__':
    sys.exit(main())