Here's something to get you going, which scrapes the pages in two different
ways:
- using regexp (slightly more elaborate than yours) (no sorting, no
totaling)
- using XSLT (sorting, no totaling)
Interestingly, the XSLT version seems substantially faster, even with all
the parsing that is entailed.
P.S. To me, the "checkins" counts reported by markmail don't seem correct;
they look far too low.
Jonathan Marsh - http://www.wso2.com - http://auburnmarshes.spaces.live.com
> -----Original Message-----
> From: [EMAIL PROTECTED] [mailto:[EMAIL PROTECTED]
> On Behalf Of Samisa Abeysinghe
> Sent: Monday, May 19, 2008 2:07 PM
> To: [EMAIL PROTECTED]; [email protected]
> Subject: Re: [mashup-dev] Regex with Scrapers
>
> Keith Chapman wrote:
> > Hi Samisa,
> >
> > Yes, the scraper supports regex. We haven't got a specific sample for
> > this in the Mashup Server. You can use regex in the scraper config as
> > explained in http://web-harvest.sourceforge.net/manual.php#regexp
>
> I will look into that.
>
> What I want is to implement [1] with Mashups server. As you can see,
> with PHP, I scrape and then sort and display.
> I want to scrape and sort with my Mashup service :)
>
> Samisa...
>
> [1] http://ww2.wso2.org/~samisa/wso2_mailing_lists.phps
>
> >
> > Thanks,
> > Keith.
> >
> >
> > Samisa Abeysinghe wrote:
> >> Can I use a regex when scraping html content?
> >>
> >> What I have seen in samples is XSLT.
> >>
> >> Thanks,
> >> Samisa...
> >>
> >
> >
> > _______________________________________________
> > Mashup-dev mailing list
> > [email protected]
> > http://www.wso2.org/cgi-bin/mailman/listinfo/mashup-dev
> > ---------------------------------------------------------------------
> ---
> >
> >
> > No virus found in this incoming message.
> > Checked by AVG.
> > Version: 8.0.100 / Virus Database: 269.23.20/1453 - Release Date:
> 5/18/2008 9:31 AM
> >
>
>
> --
> Samisa Abeysinghe
> Director, Engineering; WSO2 Inc.
>
> http://www.wso2.com/ - "The Open Source SOA Company"
>
>
> _______________________________________________
> Mashup-dev mailing list
> [email protected]
> http://www.wso2.org/cgi-bin/mailman/listinfo/mashup-dev
/*
* Copyright 2008 WSO2, Inc. http://www.wso2.org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
Created 2008-05 Jonathan Marsh; [EMAIL PROTECTED]
*/
function summarizeActivity(user) {
function scrape2xmllist(scrape) {
var lists = scrape.split(",");
var response = <></>;
for (var i in lists) {
if (lists[i] != "") {
var item = lists[i].split(":");
response += <list name={item[0]} count={item[1]}/>;
}
}
return response;
}
var timestamp = new Date().valueOf();
var scraper;
var config =
<config>
<var-def name='summary'>
<regexp>
<regexp-pattern><![CDATA[<td[^>]*><a[^>]*>(.*?)</a></td><td[^>]*>(\d+)</td>]]></regexp-pattern>
<regexp-source>
<http method='get' url='filled-in-later' />
</regexp-source>
<regexp-result>
<template>${_1}:${_2},</template>
</regexp-result>
</regexp>
</var-def>
</config>;
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " -type:checkins -subject:jira";
scraper = new Scraper(config);
var all = scrape2xmllist(scraper.summary);
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " type:checkins";
scraper = new Scraper(config);
var checkins= scrape2xmllist(scraper.summary);
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " subject:jira";
scraper = new Scraper(config);
var jiras = scrape2xmllist(scraper.summary);
var summary = <summary user={user} elapsed={new Date().valueOf() - timestamp}>
<filter type="all">{all}</filter>
<filter type="checkins">{checkins}</filter>
<filter type="jiras">{jiras}</filter>
</summary>;
return summary;
}
function summarizeActivityXSLT(user) {
var timestamp = new Date().valueOf();
var scraper;
var config =
<config>
<var-def name='summary'>
<xslt>
<xml>
<html-to-xml>
<http method='get' url='filled-in-later' />
</html-to-xml>
</xml>
<stylesheet><![CDATA[
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" omit-xml-declaration="yes"/>
<xsl:template match="/">
<filter>
<xsl:for-each select="//tr/td[a]">
<xsl:sort select="following-sibling::td" data-type="number"/>
<list name="{a}" count="{following-sibling::td}"/>
</xsl:for-each>
</filter>
</xsl:template>
</xsl:stylesheet>
]]></stylesheet>
</xslt>
</var-def>
</config>;
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " -type:checkins -subject:jira";
scraper = new Scraper(config);
var all = new XML(scraper.summary);
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " type:checkins";
scraper = new Scraper(config);
var checkins = new XML(scraper.summary);
config["var-def"[EMAIL PROTECTED] = "http://markmail.org/browse?q=list:wso2 from:" + user + " subject:jira";
scraper = new Scraper(config);
var jiras = new XML(scraper.summary);
var summary = <summary user={user} elapsed={new Date().valueOf() - timestamp}>
<filter type="all">{all.*}</filter>
<filter type="checkins">{checkins.*}</filter>
<filter type="jiras">{jiras.*}</filter>
</summary>;
return summary;
}_______________________________________________
Mashup-dev mailing list
[email protected]
http://www.wso2.org/cgi-bin/mailman/listinfo/mashup-dev