This is an automated email from the ASF dual-hosted git repository.
curcuru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git
The following commit(s) were added to refs/heads/master by this push:
new b958494 Reorganize file; use CHECK_CAPTURE values; improve command line args
b958494 is described below
commit b958494e866e4aae61d5f1f75e35ac9e1e2d29b1
Author: Shane Curcuru <[email protected]>
AuthorDate: Fri May 11 14:48:33 2018 -0400
Reorganize file; use CHECK_CAPTURE values; improve command line args
---
tools/site-scan.rb | 216 +++++++++++++++++++++++++----------------------------
1 file changed, 103 insertions(+), 113 deletions(-)
diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 8e75b35..88cd3d4 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -1,66 +1,86 @@
#!/usr/bin/env ruby
-$LOAD_PATH.unshift File.realpath(File.expand_path('../../lib', __FILE__))
-
-#
-# Scans committee pages for compliance with requirements and recommendations:
+# Scans Apache project homepages and captures text|urls for common links
+# Gathers data that can be used to check for policy compliance:
# https://www.apache.org/foundation/marks/pmcs#navigation
# http://www.apache.org/events/README.txt
+# See Also: lib/whimsy/sitestandards.rb
#
# Makes no value judgements. Simply extracts raw data for offline analysis.
-#
-
-require 'whimsy/asf'
+$LOAD_PATH.unshift File.realpath(File.expand_path('../../lib', __FILE__))
require 'net/http'
require 'nokogiri'
require 'json'
-
+require 'whimsy/asf'
require 'whimsy/cache'
+require 'whimsy/sitestandards'
+IMAGE_DIR = ASF::SVN.find('site-img')
+
+# Normalize spaces in text runs
def squash(text)
- text.scrub.gsub(/[[:space:]]+/, ' ').strip
+ return text.scrub.gsub(/[[:space:]]+/, ' ').strip
end
-#########################################################################
-
-IMAGE_DIR = ASF::SVN.find('site-img')
+# Get text from a node; use parent if text does not appear to be complete
+# This is used when scanning for some links that may
+# be in an image or other commonly related node on websites
+def getText(txt, node, match=/Apache Software Foundation/i)
+ parent = nil # debug to show where parent needed to be fetched
+ if not txt =~ match # have we got all the text?
+ if node.parent.name == 'a' # e.g. whimsical. such parents don't have extra text.
+ newnode = node.parent.parent
+ else
+ newnode = node.parent
+ end
+ # ensure <br> is treated as a separator when extracting the combined text
+ newnode.css('br').each{ |br| br.replace(" ") }
+ txt = squash(newnode.text)
+ parent = true
+ end
+ return txt, parent
+end
+# Parse an Apache project website and return text|urls that match our checks
+# @return Hash of symbols: text|url found from a check made
+# @see SiteStandards for definitions of what we should scan for (in general)
def parse(id, site, name)
+ data = {}
+ SiteStandards::COMMON_CHECKS.keys.each do |k|
+ data[k.to_sym] = nil
+ end
+ data[:display_name] = name
+ data[:uri] = site
uri = URI.parse(site)
-
- # default data
- data = {
- display_name: name,
- uri: site,
- events: nil,
- foundation: nil,
- license: nil,
- sponsorship: nil,
- security: nil,
- trademarks: nil,
- copyright: nil,
- image: nil,
- }
-
- # check if site exists
begin
Socket.getaddrinfo(uri.host, uri.scheme)
- rescue SocketError
+ rescue SocketError => se
+ data[:errors] = se.message
+ return data
+ end
+ begin
+ uri, response, status = $cache.get(site.to_s)
+ rescue IOError => ioe
+ data[:errors] = ioe.message
return data
end
-
- uri, response, status = $cache.get(site.to_s)
$stderr.puts "#{id} #{uri} #{status}"
- return data if response.respond_to? :code and response.code =~ /^[45]/
+ # Bail and return if getting the site returns an error code
+ if response.respond_to? :code and response.code =~ /^[45]/
+ data[:errors] = "cache.get(#{site.to_s}) error code #{response.code}"
+ return data
+ end
doc = Nokogiri::HTML(response)
data[:uri] = uri.to_s
- # scan each link
+ # FIRST: scan each link's a_href to see if we need to capture it
doc.css('a').each do |a|
-
- # check the link targets
+ # Normalize the text and href for our capture purposes
a_href = a['href'].to_s.strip
+ a_text = a.text.downcase.strip
+ $stderr.puts "#{a_text} #{a_href}" if $verbose
- if a_href =~ %r{^https?://(www\.)?apache\.org/?$}
+ # Check the href urls for some patterns
+ if a_href =~ SiteStandards::COMMON_CHECKS['foundation'][SiteStandards::CHECK_CAPTURE]
img = a.at('img')
if img
# use the title (hover text) in preference to the source
@@ -70,7 +90,7 @@ def parse(id, site, name)
end
end
- if a_href.include? 'apache.org/events/'
+ if a_href =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
img = a.at('img')
if img
data[:events] = uri + img['src'].strip
@@ -79,71 +99,52 @@ def parse(id, site, name)
end
end
- # check the link text
- a_text = a.text.downcase.strip
- $stderr.puts "#{a_text} #{a_href}" if $verbose
-
- # Link text is supposed to be just "License" according to:
- # https://www.apache.org/foundation/marks/pmcs#navigation
- if a_text =~ /^license$/ and a_href.include? 'apache.org'
+ # Check the a_text strings for other patterns
+ # Note this is an unusual case
+ if (a_text =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_TEXT]) and
+ (a_href =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_CAPTURE])
begin
data[:license] = uri + a_href
rescue
data[:license] = a_href
end
end
-
- if a_text =~ /\Athanks[!]?\z/ # Allow Thanks! with exclamation
- begin
- data[:thanks] = uri + a_href
- rescue
- data[:thanks] = a_href
- end
- end
-
- if a_text == 'security'
- begin
- data[:security] = uri + a_href
- rescue
- data[:security] = a_href
- end
- end
-
- if a_text =~ %r{sponsorship|donate|sponsor\sapache|sponsoring\sapache|sponsor}
- begin
- data[:sponsorship] = uri + a_href
- rescue
- data[:sponsorship] = a_href
+
+ %w(thanks security sponsorship).each do |check|
+ if a_text =~ SiteStandards::COMMON_CHECKS[check][SiteStandards::CHECK_CAPTURE]
+ begin
+ data[check.to_sym] = uri + a_href
+ rescue
+ data[check.to_sym] = a_href
+ end
end
end
end
- # Now scan the page text
+ # SECOND: scan each text node to match and capture
doc.traverse do |node|
next unless node.is_a?(Nokogiri::XML::Text)
-
txt = squash(node.text)
-
# allow override if phrase looks good
- if (txt =~ /\btrademarks\b/ and not data[:trademarks]) or txt =~/are trademarks of [Tt]he Apache Software/
+ if (txt =~ SiteStandards::COMMON_CHECKS['trademarks'][SiteStandards::CHECK_CAPTURE] and not data[:trademarks]) or txt =~/are trademarks of [Tt]he Apache Software/
t, p = getText(txt, node)
# drop previous text if it looks like Copyright sentence
data[:trademarks] = t.sub(/^.*?Copyright .+? Foundation[.]?/,'').strip
data[:tradeparent] = p if p
end
- if txt =~ /Copyright / or txt =~ /©/
+ if txt =~ SiteStandards::COMMON_CHECKS['copyright'][SiteStandards::CHECK_CAPTURE]
t, p = getText(txt, node)
# drop text around the Copyright (or the symbol)
data[:copyright] = t.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/,'\1').strip
data[:copyparent] = p if p
end
- if txt =~ / Incubation is required of all newly accepted projects /
+ # Note we also check for incubator disclaimer (immaterial of tlp|podling)
+ if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
t, p = getText(txt, node, / is an effort undergoing/)
data[:disclaimer] = t
end
end
-
- # see if image has been uploaded
+ # THIRD: see if an image has been uploaded
if IMAGE_DIR
data[:image] = Dir[File.join(IMAGE_DIR, "#{id}.*")].
map {|path| File.basename(path)}.first
@@ -152,65 +153,52 @@ def parse(id, site, name)
return data
end
-# get the text; use parent if text does not appear to be complete
-def getText(txt, node, match=/Apache Software Foundation/i)
- parent = nil # debug to show where parent needed to be fetched
- if not txt =~ match # have we got all the text?
- if node.parent.name == 'a' # e.g. whimsical. such parents don't have extra text.
- newnode = node.parent.parent
- else
- newnode = node.parent
- end
- # ensure <br> is treated as a separator when extracting the combined text
- newnode.css('br').each{ |br| br.replace(" ") }
- txt = squash(newnode.text)
- parent = true
- end
- return txt, parent
-end
-
-$verbose = ARGV.delete '--verbose'
-
+#########################################################################
+# Main execution begins here
results = {}
-
podlings = {}
-
$cache = Cache.new(dir: 'site-scan')
+$verbose = ARGV.delete '--verbose'
-# Parse a single site given its URL
-if (1..2).include? ARGV.length and ARGV.first =~ /^https?:\/\/\w/
+# USAGE:
+# site-scan.rb https://whimsical.apache.org [whimsy] [whimsy-scan.json] - to scan one project
+# site-scan.rb [project-output.json] [podlings-output.json] [projname podlingname ...]
+# If additional projname|podlingname are provided, only scans those sites
+if ARGV.first =~ /^https?:\/\/\w/
+ # Scan a single URL provided by user
site = ARGV.shift
name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
+ output_projects = ARGV.shift
results[name] = parse(name, site, name)
else
+ # Gather output filenames (if any) and scan various projects
if ARGV.first =~ %r{[./]} # have we a file name?
- outfile = ARGV.shift
+ output_projects = ARGV.shift
if ARGV.first =~ %r{[./]} # have we another file name?
- outfile2 = ARGV.shift
+ output_podlings = ARGV.shift
else
- outfile2 = nil
+ output_podlings = nil
end
else
- outfile = nil
+ output_projects = nil
end
- # scan all committees, including non-pmcs
+
+ # Scan committees, including non-pmcs
ASF::Committee.load_committee_info
committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq
-
committees.sort_by {|committee| committee.name}.each do |committee|
next unless committee.site
- # if parameters specified, parse only those names
+ # if more parameters specified, parse only those names
if ARGV.length > 0
next unless ARGV.include? committee.name
end
-
- # fetch, parse committee site
results[committee.name] = parse(committee.name, committee.site,
committee.display_name)
-
end
+
+ # Scan podlings that have a website
ASF::Podling.list.each do |podling|
if podling.status == 'current' and podling.podlingStatus[:website]
- # if parameters specified, parse only those names
+ # if more parameters specified, parse only those names
if ARGV.length > 0
next unless ARGV.include? podling.name
end
@@ -219,12 +207,14 @@ else
end
end
-# Output results
-if outfile
- File.write(outfile, JSON.pretty_generate(results))
+# Output all results
+if output_projects
+ File.write(output_projects, JSON.pretty_generate(results))
else
puts JSON.pretty_generate(results)
end
-if outfile2
- File.write(outfile2, JSON.pretty_generate(podlings))
+if output_podlings
+ File.write(output_podlings, JSON.pretty_generate(podlings))
+else
+ puts JSON.pretty_generate(podlings)
end
--
To stop receiving notification emails like this one, please contact
[email protected].