Jon a écrit :
> Hi, I'm new to Rhino and would like to make a scraper using xpath 
> queries. Specifically, I wanted to utilize firefox's xpath evaluation 
> (document.evaluate), which seems to handle lots of "dirty" html pages 
> quite well. Normally, I use/write greasy scripts, but I wanted to run 
> the scraper on a server machine.
> 
> I've found some example scrapers, but they all use regular expressions. 
> I would like to use xpath, as Firebug makes it easy to scrape from its 
> xpath links.
> 
> Any thoughts?
> -Jon

You can build on John Resig's work:
http://ejohn.org/blog/bringing-the-browser-to-the-server/

and on this:

        // XPath stuff
        // XPathFactory.newInstance() is a static factory method, so it is called
        // directly; prefixing it with `new` would try to use the method itself as
        // a constructor.
        var xpathFactory = javax.xml.xpath.XPathFactory.newInstance();
        // Return-type constants, passed as the second argument to evaluate().
        var xpathTypeNodeSet = javax.xml.xpath.XPathConstants.NODESET;
        var xpathTypeNode = javax.xml.xpath.XPathConstants.NODE;
        var xpathTypeBoolean = javax.xml.xpath.XPathConstants.BOOLEAN;
        var xpathTypeNumber = javax.xml.xpath.XPathConstants.NUMBER;
        
        /**
         * Pairs a compiled XPath expression with the context node it is run
         * against. Nothing is evaluated at construction time; every method and
         * accessor below calls evaluate() again when it is used.
         *
         * @param _dom  context node handed to every evaluate() call
         * @param xpath compiled expression exposing evaluate(item[, returnType])
         */
        var XPathResult = function(_dom, xpath) {
                this._dom = _dom;
                this._xpath = xpath;
        };
        XPathResult.prototype = {
                // Evaluates without an explicit return type; the value is whatever
                // the one-argument evaluate() yields (a string for
                // javax.xml.xpath.XPathExpression).
                toString: function() {
                        return this._xpath.evaluate(this._dom);
                },
                // Evaluates the expression asking for a BOOLEAN result.
                toBoolean: function() {
                        return this._xpath.evaluate(this._dom, xpathTypeBoolean);
                },
                // Evaluates the expression asking for a NUMBER result.
                toNumber: function() {
                        return this._xpath.evaluate(this._dom, xpathTypeNumber);
                },
                // All matching nodes, wrapped in a DOMNodeList.
                get nodes() {
                        return new DOMNodeList(this._xpath.evaluate(this._dom, xpathTypeNodeSet));
                },
                // Replaces every matching node with a copy of `n`.
                // A DOMDocument is reduced to its documentElement; any value that is
                // not a DOMNode is converted to a text node built from String(n).
                set nodes(n) {
                        if (n instanceof DOMDocument) n = n.documentElement;
                        var _dom = this._dom;
                        // Falls back to the context node itself when it has no ownerDocument
                        // (presumably because it already is the document - verify against makeNode).
                        var doc = _dom.ownerDocument || makeNode(_dom);
                        if (!(n instanceof DOMNode)) n = doc.createTextNode(String(n));
                        this.nodes.forEach(function(node) {
                                // importNode is called per match, so each replaced node
                                // receives its own imported copy.
                                node.parentNode.replaceChild(doc.importNode(n, true), node);
                        });
                },
                // The single node produced by a NODE-typed evaluation, passed through makeNode.
                get node() {
                        return makeNode(this._xpath.evaluate(this._dom, xpathTypeNode));
                },
                // Replaces the single matching node with a copy of `n`, applying the
                // same conversions as the `nodes` setter.
                set node(n) {
                        if (n instanceof DOMDocument) n = n.documentElement;
                        var _dom = this._dom;
                        var doc = _dom.ownerDocument || makeNode(_dom);
                        if (!(n instanceof DOMNode)) n = doc.createTextNode(String(n));
                        var oldNode = this.node;
                        oldNode.parentNode.replaceChild(doc.importNode(n, true), oldNode);
                }
        };
        
        // Methods mixed into the DOM wrapper types; both operate on the
        // receiver's `_dom` property.
        var xpathMixin = {
                // Compiles `sxpath` on a freshly created XPath object and returns an
                // XPathResult bound to this wrapper's `_dom`.
                xpath: function (sxpath) {
                        var compiled = xpathFactory.newXPath().compile(sxpath);
                        return new XPathResult(this._dom, compiled);
                },
                // Shorthand for xpath(sxpath).nodes.
                select: function (sxpath) {
                        var result = this.xpath(sxpath);
                        return result.nodes;
                }
        };
        // Install the mixin on each DOM wrapper prototype (extend() is defined
        // elsewhere; presumably it copies the mixin's properties onto its first argument).
        [DOMNode, DOMNodeList, DOMDocument]
                .map(function(ctor) { return ctor.prototype; })
                .forEach(function(proto) { extend(proto, xpathMixin); });
_______________________________________________
dev-tech-js-engine-rhino mailing list
[email protected]
https://lists.mozilla.org/listinfo/dev-tech-js-engine-rhino

Reply via email to