This is an automated email from the ASF dual-hosted git repository.
sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git
The following commit(s) were added to refs/heads/master by this push:
new 87295b5c Look for disclaimer away from home page
87295b5c is described below
commit 87295b5c305ba64447012d3285c97592b3c6d5d5
Author: Sebb <[email protected]>
AuthorDate: Thu Apr 25 01:03:47 2024 +0100
Look for disclaimer away from home page
TODO: add this into reports
---
tools/site-scan.rb | 45 ++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 40 insertions(+), 5 deletions(-)
diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 4e267580..f57a832e 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -7,6 +7,7 @@
#
# Makes no value judgements. Simply extracts raw data for offline analysis.
$LOAD_PATH.unshift '/srv/whimsy/lib'
+require 'set'
require 'net/http'
require 'nokogiri'
require 'json'
@@ -66,7 +67,7 @@ end
# Parse an Apache project website and return text|urls that match our checks
# @return Hash of symbols: text|url found from a check made
# @see SiteStandards for definitions of what we should scan for (in general)
-def parse(id, site, name)
+def parse(id, site, name, podling=false)
data = {}
# force https to avoid issue with cache (sites should use https anyway)
site.sub!(%r{^http:},'https:')
@@ -102,8 +103,9 @@ def parse(id, site, name)
end
data[:uri] = uri.to_s
+ subpages = Set.new
# FIRST: scan each link's a_href to see if we need to capture it
- # also capture script src for events
+ # also capture script src for events, and some page refs for podlings
doc.traverse do |a|
if a.name == 'script'
@@ -157,6 +159,16 @@ def parse(id, site, name)
end
end
end
+ unless a_href =~ %r{^(#|mailto:)}
+ begin
+ site2 = URI.join(site,a_href.gsub(' ','+'))
+ if site2.host == uri.host and site2.path.size > 2
+ subpages.add site2.to_s
+ end
+ rescue StandardError
+ $stderr.puts "Bad a_href #{a_href}"
+ end
+ end
end
# SECOND: scan each text node to match and capture
@@ -183,6 +195,28 @@ def parse(id, site, name)
data[:disclaimer] = t
end
end
+
+ # Brief scan of initial sub-pages to look for disclaimers
+ # TODO also look for a download page?
+ if podling
+ hasdisclaimer = 0
+ nodisclaimer = []
+ subpages.each do |subpage|
+ begin
+ uri, response, status = $cache.get(subpage)
+ if response =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
+ hasdisclaimer += 1
+ else
+ nodisclaimer << subpage
+ end
+ rescue URI::InvalidURIError
+ end
+ end
+ if nodisclaimer.size > 0
+ data[:disclaimers] = [hasdisclaimer, nodisclaimer]
+ end
+ end
+
# THIRD: see if an image has been uploaded
data[:image] = ASF::SiteImage.find(id)
@@ -299,10 +333,11 @@ puts "Started: #{Time.now}" # must agree with site-scan
monitor
# If additional projname|podlingname are provided, only scans those sites
if ARGV.first =~ /^https?:\/\/\w/
# Scan a single URL provided by user
- site = ARGV.shift
+ podling = ARGV.delete('--podling')
+ site = ARGV.shift.dup # needs to be unfrozen
name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
output_projects = ARGV.shift
- results[name] = parse(name, site, name)
+ results[name] = parse(name, site, name, podling)
else
# Gather output filenames (if any) and scan various projects
if ARGV.first =~ %r{[./]} # have we a file name?
@@ -335,7 +370,7 @@ else
if ARGV.length > 0
next unless ARGV.include? podling.name
end
- podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name)
+ podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name, true)
end
end
end
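
A usage sketch (the URL below is illustrative, not from the commit): the new
--podling flag marks a single-site scan as a podling, which enables the
sub-page disclaimer scan added above. Because the URL check on ARGV.first
runs before ARGV.delete('--podling'), the flag must follow the URL:

    tools/site-scan.rb https://example.incubator.apache.org/ --podling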
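
For the TODO above (folding this into reports), a minimal consumer sketch,
assuming the scan results are serialized to JSON as site-scan does for its
other output; the file name and report format are placeholders:

    require 'json'

    # :disclaimers is only set when at least one scanned sub-page lacked the
    # incubation disclaimer; it holds
    # [count_of_pages_with_disclaimer, [urls_without_disclaimer]]
    JSON.parse(File.read('site-scan-podlings.json')).each do |name, data|
      with_count, missing = data['disclaimers'] || next
      puts "#{name}: #{with_count} sub-page(s) with disclaimer, #{missing.size} without:"
      missing.each { |url| puts "  #{url}" }
    end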