GRDDL implementation and testHarness port

Chimezie Ogbuji Thu, 16 Nov 2006 15:11:30 -0800

See attachment

#!/usr/bin/env python
"""
Gleaning Resource Descriptions from Dialects of Languages (GRDDL)


GRDDL provides a relatively inexpensive mechanism for bootstrapping RDF content from
uniform XML dialects; shifting the burden from formulating RDF to creating transformation 
algorithms specifically for each dialect. XML Transformation languages such as XSLT are 
quite versatile in their ability to process, manipulate, and generate XML. The use of XSLT 
to generate XHTML from single-purpose XML vocabularies is historically celebrated as a powerful 
idiom for separating structured content from presentation.

GRDDL shifts this idiom to a different end: separating structured content from its authoritative 
meaning (or semantics). GRDDL works by associating transformations for an individual document, 
either through direct inclusion of references or indirectly through profile and namespace documents. 

See: 
- http://4suite.org/docs/CoreManual.xml#xpath_query
- http://4suite.org/docs/CoreManual.xml#xslt_engine
- http://4suite.org/docs/CoreManual.xml#id1219140460 (for http://www.w3.org/2004/01/rdxh/spec#issue-base-param)

Copyright (c) 2006, Chimezie Ogbuji
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of inamidst.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import time, sys, urllib2, re, getopt
from sets import Set
try:
    from cStringIO import StringIO
except ImportError:        
    from StringIO import StringIO
from pprint import pprint
from rdflib import Variable, BNode, URIRef, Literal, Namespace, RDF, RDFS
from rdflib.Collection import Collection
from rdflib.Graph import ConjunctiveGraph, QuotedGraph, ReadOnlyGraphAggregate, Graph
from rdflib.syntax.NamespaceManager import NamespaceManager
import  Ft.Xml.Domlette
from Ft.Xml import Parse
from Ft.Xml.Xslt import Processor
from Ft.Xml import InputSource
from Ft.Lib.Uri import Absolutize

#Namespace of the GRDDL vocabulary (note the trailing '#'); used to build property
#URIs such as GRDDL_NS.namespaceTransformation
GRDDL_NS = Namespace("http://www.w3.org/2003/g/data-view#")
#URI of the GRDDL XHTML profile (the vocabulary namespace without the trailing '#')
GRDDL_URI = u'http://www.w3.org/2003/g/data-view'
XHTML_NS = u"http://www.w3.org/1999/xhtml"
#Built-in list of namespace uri's that should terminate any recursive namespace dispatch
NSDispatchTermination = [XHTML_NS]
#Flag to determine whether or not to only attempt to parse a URL as XML if it's mime-type
#is appropriate
CHECK_XML_MIMETYPE = False
#Module-level default for the debug flag so the classes below can be used when this
#module is imported (main() overrides it according to the --debug option)
DEBUG = False

class Glean(object):
    """
    Handles all the GRDDL XML parsing and XSLT transformation from URLs
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        
        >>> g = Glean(u'http://www.w3.org/2003/g/po-doc.xml',Graph())
        >>> g.dom.documentElement.localName
        u'purchaseOrder'
        
        """
        self.graph = graph
        self.url = url
        if preParsedDOM:
            self.dom = preParsedDOM 
        else:
            req = urllib2.Request(url)
            try:
                #peek in response headers to determine content-type
                u = urllib2.urlopen(req)
                self.headers = u.info()
                if re.match(r'(?:text|application)/.*\+?xml', self.headers['content-type']) is None and CHECK_XML_MIMETYPE:
                    #What does the spec mandate about this scenario?
                    if DEBUG:
                        print "Ignoring non-xml information resource", url
                        print self.headers['content-type'].split(';')[0]
                    self.dom = None
                else:
                    rt = u.read()
                    self.dom = Ft.Xml.Domlette.NonvalidatingReader.parseString(rt, url)
            except Exception, e:
                if DEBUG:
                    print "Unable to parse ", url, repr(e)
                #Unable to glean.  Fail gracefully..
                self.dom = None
        self.appliedTransforms = []
        
    def transform(self, transformURLs):
        """
        Takes a space seperated list of transform url's and applies them against thhe
        pre-parsed DOM of the GRDDL source - making sure to avoid transformation already applied
        """                
        for xformURL in transformURLs.split():
            if DEBUG:
                print "applying transformation %s" % (xformURL)
            if xformURL not in self.appliedTransforms:
                self.appliedTransforms.append(xformURL)
            #The transform url is resolved against the source URL (to accomodate relative urls)
            transform = InputSource.DefaultFactory.fromUri(Absolutize(xformURL, self.url))
            processor = Processor.Processor()
            processor.appendStylesheet(transform)
            #see: http://www.w3.org/TR/grddl/#stylepi
            result = processor.runNode(self.dom, self.url, ignorePis=1)
            #get output method / media-type
#            <!-- Category: top-level-element -->
#            <xsl:output
#              method = "xml" | "html" | "text" | qname-but-not-ncname
#              version = nmtoken
#              encoding = string
#              omit-xml-declaration = "yes" | "no"
#              standalone = "yes" | "no"
#              doctype-public = string
#              doctype-system = string
#              cdata-section-elements = qnames
#              indent = "yes" | "no"
#              media-type = string />

            #How to accomodate @media-type?
            method = processor.outputParams.method[-1]
            currLen = len(self.graph)
            if method == 'xml':
                self.graph.parse(StringIO(result), publicID=self.url)
                if DEBUG:
                    print "Parsed %s triples as RDF/XML"%(max(0,len(self.graph) - currLen))
            elif method == 'text':
                #Attempt a Notation 3 parse (covers NTriples, and Turtle)
                self.graph.parse(StringIO(result), format='n3', publicID=self.url)
                if DEBUG:
                    print "Parsed %s triples as Notation 3"%(max(0,len(self.graph) - currLen))
            else:
                #HTML result - recursive GRDDL mechanism?
                raise Exception("unsupported output type")
class GRDDLAgent:
    """
    The main entry point for the GRDDL agent
    Takes a url and a graph to store the GRDDL result and attempts to 'glean'
    in the 4 major ways that GRDDL mandates
    """
    def __init__(self, url, graph):
        if DEBUG:
            print "Attempting a comprehensive glean of ", url
        parsedSource = None
        for gleanMethod in [XMLGlean, XMLNSGlean, XHTMLProfileGlean, ValidXHTMLGlean]:
            #Don't reparse the GRDDL source
            if not parsedSource:
                gleaned = gleanMethod(url, graph)
                parsedSource = gleaned.dom                
            elif parsedSource:
                gleanMethod(url, graph, preParsedDOM=parsedSource)            

class XMLGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#grddl-xml - Adding GRDDL to well-formed XML
    
    The general form of associating a GRDDL transformation link with a well-formed 
    XML document is by adorning the root element with a grddl namespace declaration 
    and a grddl:transformation attribute whose value is a URI reference, or list 
    of URI references, that refer to executable scripts or programs which are 
    expected to transform the source document into RDF.
        
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Applies the transformations listed in the root element's
        grddl:transformation attribute (if any).  preParsedDOM, when given, is
        used instead of retrieving and parsing url again.

        >>> g = XMLGlean(u'http://www.w3.org/2003/g/po-ex', Graph())
        >>> g.appliedTransforms[0]
        u'http://www.w3.org/2003/g/embeddedRDF.xsl'
        >>> pprint(list(g.graph))
        [(u'http://www.w3.org/2003/g/po-ex',
          u'http://www.w3.org/2003/g/data-view#namespaceTransformation',
          u'http://www.w3.org/2003/g/grokPO.xsl')]
        """
        #forward the pre-parsed DOM so the GRDDL source is not fetched and parsed again
        super(XMLGlean, self).__init__(url, graph, preParsedDOM)
        #always defined, even when the source could not be parsed
        self.xforms = []
        if self.dom:
            self.xforms = [attr.value for attr in self.dom.xpath(u'/*/@data-view:transformation', {u'data-view':u'http://www.w3.org/2003/g/data-view#'})]
            if self.xforms:
                #the attribute value is a space separated list, which transform() splits
                self.transform(self.xforms[0])

class XMLNSGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#ns-bind - Using GRDDL with XML Namespace Documents
    
    Any resource available for retrieval from a namespace URI is a namespace document 
    (cf. section 4.5.4. Namespace documents in [WEBARCH]). For example, a namespace 
    document may have an XML Schema representation or an RDF Schema representation, 
    or perhaps both
        
    To associate a GRDDL transformation with a whole dialect, have the namespace document 
    include the grddl:namespaceTransformation property.
    
    * if an information resource ?D  has an XML representation whose root element has a 
      namespace name ?NS then any GRDDL result of the resource identified by ?NS  is a GRDDL 
      result of ?D
    * if an information resource ?D has an XML representation whose root element has a 
      namespace name ?NSDOC and ?D has a GRDDL result that includes, for any ?TX, the RDF 
      triple { ?NSDOC <http://www.w3.org/2003/g/data-view#namespaceTransformation> ?TX } then 
      ?TX is also a transformation of ?D            
        
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Gleans the namespace document of the source's root element and then applies
        every namespaceTransformation asserted about that namespace to the source.
        preParsedDOM, when given, is used instead of retrieving and parsing url again.

        >>> g = XMLNSGlean(u'http://www.w3.org/2003/g/po-doc.xml', Graph())
        >>> g.nsURI
        u'http://www.w3.org/2003/g/po-ex'
        >>> len(g.graph)
        16
        """
        #forward the pre-parsed DOM so the GRDDL source is not fetched and parsed again
        super(XMLNSGlean, self).__init__(url, graph, preParsedDOM)
        self.nsURI = None
        if not self.dom:
            return
        self.nsURI = self.dom.xpath(u'/*')[0].namespaceURI
        if not self.nsURI or self.nsURI in NSDispatchTermination:
            #no namespace, or one that is known to terminate the namespace dispatch
            return
        #glean GRDDL result from the namespace document
        GRDDLAgent(self.nsURI, self.graph)
        #setup a set of processed transforms to avoid infinite namespace snooping cycles
        processedNSXForms = Set()
        #Recursively find 'new' namespace transformations
        while True:
            #materialize the matches first: transform() adds triples to the graph
            todoXForms = Set()
            for s, p, xform in self.graph.triples((self.nsURI, GRDDL_NS.namespaceTransformation, None)):
                if xform not in processedNSXForms:
                    todoXForms.add(xform)
            #continue only if we have xforms to apply
            if not todoXForms:
                break
            #apply the new namespace transforms on the GRDDL source, merging the GRDDL results as we go
            for newXForm in todoXForms:
                self.transform(newXForm)
                processedNSXForms.add(newXForm)

class ValidXHTMLGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#grddl-xhtml - Using GRDDL with valid XHTML
    
    The general form of adding a GRDDL assertion to a valid XHTML document 
    is by specifying the GRDDL profile in the profile attribute of the head 
    element, and transformation as the value of the rel attribute of a link 
    or a element whose href attribute value is a URI reference that refers 
    to an executable script or program which is expected to transform the 
    source document into RDF. This method is suitable for use with valid 
    XHTML documents which are constrained by an XML DTD.
    
    Stated more formally:

    * An XHTML document whose metadata profiles include 
      http://www.w3.org/2003/g/data-view has a GRDDL transformation for each 
      resource identified by a link of type transformation.
              
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Applies the transformation referenced by every a / link element with
        rel="transformation", provided the head element's profile attribute
        contains the GRDDL profile URI.  preParsedDOM, when given, is used
        instead of retrieving and parsing url again.
        """
        #forward the pre-parsed DOM so the GRDDL source is not fetched and parsed again
        super(ValidXHTMLGlean, self).__init__(url, graph, preParsedDOM)
        if self.dom:
            xhtmlNSMap = {u'xhtml':XHTML_NS}
            for xform in self.dom.xpath(u'/xhtml:html[xhtml:head[contains(@profile, "%s")]]//xhtml:*[(local-name() = "a" or local-name() = "link") and @rel="transformation"]/@href'%GRDDL_URI, xhtmlNSMap):
                self.transform(xform.value)

class XHTMLProfileGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#profile-bind - GRDDL for HTML Profiles
    
    A more formal description on the relation between GRDDL and XHTML profiles follows:

    * if an information resource ?D has an XHTML representation whose profile attribute 
      refers to ?PROFILE, then any GRDDL result of ?PROFILE is a GRDDL result of ?D
    * if an information resource ?D has an XHTML representation whose profile attribute 
      refers to ?PROFILE and ?D has a GRDDL result that includes, for any ?TX, the RDF triple 
      { ?PROFILE <http://www.w3.org/2003/g/data-view#profileTransformation> ?TX } then ?TX 
      is also a GRDDL transformation of ?D    
    
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        >>> g = XHTMLProfileGlean(u'http://www.w3.org/2003/g/data-view', Graph())
        >>> g.profiles
        >>> len(g.graph)
        
        """
        super(XHTMLProfileGlean, self).__init__(url, graph)
        self.profiles = []
        if self.dom:
            profile = self.dom.xpath(u'/xhtml:html/xhtml:head/@profile', {u'xhtml':XHTML_NS})
            if profile:
                self.profiles = profile[0].value.split()
                for profile in self.profiles:
                    if profile == GRDDL_URI:
                        continue
                    if DEBUG:
                        print "processing profile url: ", profile
                    #glean GRDDL result from the profile document
                    GRDDLAgent(profile, self.graph)
                    continueRecursion = True
                    #setup a set of processed transforms to avoid infinite profile snooping cycles
                    processedProfileXForms = Set()
                    #Recursively find 'new' namespace transformations
                    while continueRecursion:
                        todoXForms = Set()
                        for s, p, xform in self.graph.triples((profile, GRDDL_NS.profileTransformation, None)):
                            if xform not in processedProfileXForms:
                                todoXForms.add(xform)
                        #continue only if we have xforms to apply
                        continueRecursion = bool(todoXForms)
                        #apply the new namespace transforms on the GRDDL source, merging the GRDDL results as we go
                        for newXForm in todoXForms:
                            self.transform(newXForm)
                            processedProfileXForms.add(newXForm)

        
#Namespaces of the OWL and SWAP log vocabularies
OWL_NS    = Namespace("http://www.w3.org/2002/07/owl#")
#Alias for None
Any = None
LOG = Namespace("http://www.w3.org/2000/10/swap/log#")
    
def usage():
    """Print the command line synopsis of GRDDL.py to standard output."""
    synopsis = "GRDDL.py [--test] [--help] [--debug] [--output-format=<'n3' or 'ntriples' or 'xml'>] [--ns=prefix=namespaceUri]* srcURL"
    print(synopsis)
    
def main(argv):
    global DEBUG
    
    if argv is None: argv = sys.argv
    try:
        opts, args = getopt.getopt(argv[1:], "",
                                   ["help", "output-format=","debug","ns="])
    except getopt.GetoptError,e:
        print e
        usage()
        return 2
    output = 'xml'
    DEBUG=False    
    docTesting = False    
    nsBinds = {
        'rdf' : RDF.RDFNS,
        'rdfs': RDFS.RDFSNS,
        'owl' : "http://www.w3.org/2002/07/owl#";       
    }
    for o, a in opts:
        if o == "--test":
            docTesting = True            
        elif o == "--ns=":
            pref,nsUri = a.split('=')
            nsBinds[pref]=nsUri
        elif o == "--debug":
            DEBUG = True
        elif o == "--help":
            usage()
            return 0
        elif o == "--output-format":
            output = a
    if len(argv) < 2:
        usage()
        return 2
    elif docTesting:
        test()
        return 2
    graph = Graph()
    namespace_manager = NamespaceManager(Graph())
    for prefix,uri in nsBinds.items():
        namespace_manager.bind(prefix, uri, override=False)        
    graph.namespace_manager = namespace_manager
    g=GRDDLAgent(argv[-1], graph)
    print graph.serialize(format=output)
    
def test():
    """Run the doctests via doctest.testmod() with its default module."""
    from doctest import testmod
    testmod()

if __name__ == '__main__':
    #hand main()'s return value to the shell as the exit status instead of discarding it
    sys.exit(main(sys.argv))

#!/usr/bin/env python
"""
Test harness for GRDDL test suite. Uses RDFLib to process test manifest and perform graph
isomorphism testing against expected output.  Uses Sean B. Palmer's RDF Graph Isomorphism Tester

Copyright (c) 2006, Sean B. Palmer, Chimezie Ogbuji
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of inamidst.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

[ a e:Assertion; e:assertedBy [ foaf:homepage <http://www.w3.org/People/Connolly> ]; e:testSubject
          <glean.py>; e:test [ = <grddl-wg/td/testlist1#sq1a>; a e:TestCase ]; e:testResult [ a e:TestResult; e:validity
          e:pass ] ] .


"""
from pprint import pprint
from sets import Set
from rdflib.Namespace import Namespace
from rdflib import plugin,RDF,RDFS,URIRef,URIRef,Literal,Variable,BNode
from rdflib.store import Store
from cStringIO import StringIO
from rdflib.Graph import Graph,ReadOnlyGraphAggregate,ConjunctiveGraph
import sys, getopt
import os, tempfile
import urllib2

#Dublin Core elements namespace (bound to the 'dc' prefix when querying the test manifest)
DC_NS = Namespace('http://purl.org/dc/elements/1.1/')

#Module-level default for the debug flag, so the functions below can be used
#without going through main() (which sets it from the -d / --debug option)
DEBUG = 0

def compare(p, q):
    """
    Returns True when the RDF documents at p and q parse to isomorphic graphs.

    The rdfdiff.py original hashed its own Graph(uri) class; rdflib's Graph takes
    a store as its first argument, so both documents are parsed explicitly here
    and compared with IsomorphicTestableGraph's isomorphism test.
    """
    left = IsomorphicTestableGraph()
    left.parse(p)
    right = IsomorphicTestableGraph()
    right.parse(q)
    return left == right

class IsomorphicTestableGraph(Graph):
    """
    Graph whose == / != operators test for graph isomorphism by comparing hashes
    in which blank nodes are replaced by a signature of the triples they occur in.

    Ported from http://www.w3.org/2001/sw/DataAccess/proto-tests/tools/rdfdiff.py
     (Sean B Palmer's RDF Graph Isomorphism Tester)
    """
    def __init__(self, **kargs): 
        super(IsomorphicTestableGraph,self).__init__(**kargs)
        # NOTE(review): not read by any method of this class
        self.hash = None
        
    def internal_hash(self):
        """
        Returns a hash of the sorted per-triple hashes of the graph.

        This is defined instead of __hash__ to avoid a circular recursion scenario with the Memory
        store for rdflib which requires a hash lookup in order to return a generator of triples
        """ 
        return hash(tuple(sorted(self.hashtriples())))

    def hashtriples(self): 
        """
        Yields one hash per triple, with every blank node replaced by its vhash
        signature (the and/or idiom falls back to the term itself whenever the
        left-hand side is false, which includes every non-blank term).
        """
        for triple in self: 
            g = ((isinstance(t,BNode) and self.vhash(t)) or t for t in triple)
            yield hash(tuple(g))

    def vhash(self, term, done=False): 
        """
        Returns the signature of term: the sorted tuple of the signatures of all
        triples that contain it.
        """
        return tuple(sorted(self.vhashtriples(term, done)))

    def vhashtriples(self, term, done): 
        """Yields the signature tuple of every triple in the graph that contains term."""
        for t in self: 
            if term in t: yield tuple(self.vhashtriple(t, term, done))

    def vhashtriple(self, triple, term, done): 
        """
        Yields the three components of a triple's signature: non-blank terms are
        yielded unchanged; a blank node is yielded as its position (0-2) when it
        is term itself or when done is set; any other blank node is yielded as its
        own vhash computed with done=True, which limits the expansion to one level.
        """
        for p in xrange(3): 
            if not isinstance(triple[p], BNode): yield triple[p]
            elif done or (triple[p] == term): yield p
            else: yield self.vhash(triple[p], done=True)
      
    def __eq__(self, G): 
        """
        Graph isomorphism testing.

        False when G is not an IsomorphicTestableGraph or the sizes differ; True
        when both graphs iterate to equal triple lists; otherwise the result of
        comparing the two internal hashes.
        """
        if not isinstance(G, IsomorphicTestableGraph): return False
        elif len(self) != len(G): return False
        elif list.__eq__(list(self),list(G)): return True # @@
        return self.internal_hash() == G.internal_hash()

    def __ne__(self, G): 
       """Negative graph isomorphism testing."""
       return not self.__eq__(G)

def runProcessor(processor,inputUri):
    if DEBUG:
        print "running: ", processor + " " + inputUri
    return os.popen(processor + " " + inputUri,"r")    

def updateTest(processor,inputUri,outputUri):
    outputfilename = outputUri[outputUri.rfind('/')+1:]
    print "Updating ",outputfilename
    outputfile = open(outputfilename,"w")
    output = runProcessor(processor,inputUri)
    outputfile.write(output.read())
    outputfile.close()

# Returns false when applying processor on inputUri differs from outputUri
def runTest(processor,inputUri,outputUri):
    output = runProcessor(processor,inputUri)
    outputfilename = outputUri[outputUri.rfind('/')+1:] + ".result"
    if DEBUG:
        print "Saving result to ",outputfilename
    outputfile = open(outputfilename,"w")
    outputfile.write(output.read())
    outputfile.close()
    expected = IsomorphicTestableGraph().parse(outputUri)
    try:
        actual = IsomorphicTestableGraph().parse(outputfilename)
    except:
        if DEBUG:
            print "problems parsing result"
        return False
    if len(actual) != len(expected):
        if DEBUG:
            print "Unequal lengths: expected = %s actual = %s"%(len(expected),len(actual))
    rt = actual == expected
    if DEBUG and actual != expected:
        print "Missing: "
        pprint(list(Set(list(expected)).difference(list(actual))))
    return rt

def process(action,uri,processor):
    data = Graph()
    data.parse(uri)

    nsMapping = {
        u'test':Namespace('http://www.w3.org/2000/10/rdf-tests/rdfcore/testSchema#'),
        u'dc':DC_NS
    }
    hasFailure = 0
    for descr,test,input,output in data.query("SELECT ?desc ?t ?i ?o WHERE { ?t test:inputDocument ?i. ?t a test:Test . ?t dc:title ?desc. ?t test:outputDocument ?o }",initNs=nsMapping):
        if DEBUG:
            print "###", descr, "###"
            print "\t",input
        if action=="update":
            updateTest(processor,input,output)
        elif action=="run":
            if not runTest(processor,input,output):
                print "* %s failed" % test
                hasFailure = 1
    if not hasFailure and action=="run":
        print "All tests were passed!"

def main(argv=None):
    """
    Command line entry point of the test harness.

    argv is the complete argument vector (program name first); sys.argv is used
    when it is None.  Exactly one positional argument - the URI of the list of
    tests - is required, together with either -r/--run or -u/--update naming the
    processor command.  -d/--debug sets the global DEBUG flag and -h/--help
    prints the usage text.

    Returns 0 on success (or after --help) and 2 when the command line is not usable.
    """
    global DEBUG
    if argv is None:
        argv = sys.argv

    try:
        opts, args = getopt.getopt(argv[1:], "dhr:u:",
                                   ["help", "run=", "update=","debug"])
    except getopt.GetoptError:
        usage()
        return 2

    # option flag -> action it selects (the option's value is the processor command)
    actionForFlag = {"-r": "run", "--run": "run", "-u": "update", "--update": "update"}
    processor = None
    action = None
    DEBUG=0
    for flag, value in opts:
        if flag in ("-d", "--debug"):
            DEBUG = 1
        elif flag in ("-h", "--help"):
            usage()
            return 0
        elif flag in actionForFlag:
            processor = value
            action = actionForFlag[flag]
    if not processor or not action or len(args) != 1:
        usage()
        return 2
    # URI of the list of tests
    tests = args[0]
    process(action,tests,processor)

    return 0

def usage():
    """
    Prints the module docstring, followed by the module's __version__ when one
    is defined.
    """
    print(__doc__)
    # __version__ is not defined anywhere in this module, so printing it
    # unconditionally raised a NameError; only print it when it exists
    version = globals().get('__version__')
    if version is not None:
        print(version)
    
# Script entry point: the value returned by main() becomes the process exit status
if __name__ == '__main__':
    sys.exit(main())

GRDDL implementation and testHarness port

Reply via email to