Daniel Turcanu has proposed merging lp:~danielturcanu/zorba/my_conv_module into lp:zorba/data-converters-module.
Requested reviews:
  Zorba Coders (zorba-coders)

For more details, see:
https://code.launchpad.net/~danielturcanu/zorba/my_conv_module/+merge/78588

Fixed the link crawler test
--
https://code.launchpad.net/~danielturcanu/zorba/my_conv_module/+merge/78588
Your team Zorba Coders is requested to review the proposed merge of lp:~danielturcanu/zorba/my_conv_module into lp:zorba/data-converters-module.
=== modified file 'test_html/Queries/converters/html/link_crawler2.xq2' --- test_html/Queries/converters/html/link_crawler2.xq2 2011-10-06 07:40:17 +0000 +++ test_html/Queries/converters/html/link_crawler2.xq2 2011-10-07 13:01:14 +0000 @@ -18,6 +18,7 @@ import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map"; import module namespace html = "http://www.zorba-xquery.com/modules/converters/html"; import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml"; +import module namespace file = "http://expath.org/ns/file"; declare namespace ann = "http://www.zorba-xquery.com/annotations"; declare namespace xhtml="http://www.w3.org/1999/xhtml"; @@ -29,12 +30,9 @@ declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/"; -declare variable $supported-media-types as xs:string+ := ("text/xml", "application/xml", "text/xml-external-parsed-entity", "application/xml-external-parsed-entity", - "application/atom+xml", "text/html"); - - -declare variable $local:processed-internal-links:=xs:QName("processed-internal-links"); -declare variable $local:processed-external-links :=xs:QName("processed-external-links"); + +declare variable $local:processed-internal-links := xs:QName("processed-internal-links"); +declare variable $local:processed-external-links := xs:QName("processed-external-links"); declare %ann:sequential function local:create-containers() @@ -59,9 +57,13 @@ return if($sb = "") then $s1 else $sb }; -declare function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string +declare function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string? 
{ - local:my-substring-before(resolve-uri($href, $start-uri), "#") + try{ + local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#") + } + catch* + { () } }; @@ -70,9 +72,11 @@ local:my-substring-before($http-call/httpsch:header[@name = 'Content-Type'][1]/string(@value), ";") }; -declare function local:alive($http-call as node()*) as xs:boolean +declare function local:alive($http-call as item()*) as xs:boolean { - if(($http-call[1]/@status eq 200)) then true() else false() + if((count($http-call) ge 1) and + ($http-call[1]/@status eq 200)) + then true() else fn:trace(false(), "alive") }; @@ -92,17 +96,24 @@ distinct-values( let $search := fn:analyze-string($content, "(<|&lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7") for $other-uri2 in $search//group[@nr=8]/string() - let $y:= fn:normalize-space($other-uri2) - return local:get-real-link($y, $uri) + return local:get-real-link($other-uri2, $uri) ) }; +declare %ann:sequential function local:process-link($x as xs:string, $n as xs:integer) as item()*{ + if(local:is-internal($x)) + then local:process-internal-link($x, $n); + else local:process-external-link($x); + +}; + declare %ann:sequential function local:process-external-link($x as xs:string){ if(not(empty(map:get($local:processed-external-links, $x)))) then exit returning false(); else {} + fn:trace($x, "HEAD external link"); variable $http-call:=(); try{ $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ()); @@ -127,39 +138,42 @@ declare %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){ - if($n=3) then exit returning (); else {} + (: if($n=3) then exit returning (); else {} :) if(not(empty(map:get($local:processed-internal-links, $x)))) then exit returning false(); else {} + fn:trace($x, "GET internal link"); variable $http-call:=(); try{ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ()); } - catch * {} - if( 
not(local:alive($http-call))) + catch * { } + if( not(local:alive($http-call))) then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();} else {} - if(not (local:get-media-type($http-call[1]) = $supported-media-types)) - then {map:insert($local:processed-internal-links, "clean", $x); exit returning ();} + + if(not (local:get-media-type($http-call[1]) = "text/html")) + then { map:insert($local:processed-internal-links, "clean", $x); exit returning ();} else {} - variable $string-content := xs:string($http-call[2]); + variable $string-content := string($http-call[2]); variable $content:=(); try{ $content:=html:parse($string-content,local:tidy-options() ); + map:insert($local:processed-internal-links, "clean", $x); } catch * - { - map:insert($local:processed-internal-links, concat("cannot tidy", $err:description), $x); + { + map:insert($local:processed-internal-links, concat("cannot tidy ", $err:description), $x); try{ $content:=parse-xml:parse-xml-fragment ($string-content, ""); } catch * - { map:insert($local:processed-internal-links, concat("cannot parse", $err:description), $x);} + { map:insert($local:processed-internal-links, concat("cannot parse ", $err:description), $x);} } variable $links :=(); if(empty($content)) - then $links:=local:get-out-links-unparsed($string-content, $x); + then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed")); else $links:=local:get-out-links-parsed($content, $x); for $l in $links return local:process-link($l, $n+1); @@ -167,13 +181,6 @@ -declare %ann:sequential function local:process-link($x as xs:string, $n as xs:integer) as item()*{ - if(local:is-internal($x)) - then local:process-internal-link($x, $n); - else local:process-external-link($x); - -}; - declare function local:print-results() as element()* { @@ -190,13 +197,15 @@ variable $result; -try { - local:create-containers(); - local:process-link($uri, 1); - $result:=local:print-results() ; -} 
-catch * { $result:=concat("an error occurred", $err:description);} - +local:create-containers(); +local:process-link($uri, 1); +$result:=local:print-results() ; local:delete-containers(); -$result + +file:write(fn:resolve-uri("link_crawler_result.xml"), + <result>{$result}</result>, + <output:serialization-parameters> + <output:indent value="yes"/> + </output:serialization-parameters>) +
--
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp