[whimsy] branch master updated: Reorganize file; use CHECK_CAPTURE values; improve command line args

curcuru Fri, 11 May 2018 11:49:06 -0700

This is an automated email from the ASF dual-hosted git repository.

curcuru pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git



The following commit(s) were added to refs/heads/master by this push:
     new b958494  Reorganize file; use CHECK_CAPTURE values; improve command line args
b958494 is described below

commit b958494e866e4aae61d5f1f75e35ac9e1e2d29b1
Author: Shane Curcuru <[email protected]>
AuthorDate: Fri May 11 14:48:33 2018 -0400

    Reorganize file; use CHECK_CAPTURE values; improve command line args
---
 tools/site-scan.rb | 216 +++++++++++++++++++++++++----------------------------
 1 file changed, 103 insertions(+), 113 deletions(-)

diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index 8e75b35..88cd3d4 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -1,66 +1,86 @@
 #!/usr/bin/env ruby
-$LOAD_PATH.unshift File.realpath(File.expand_path('../../lib', __FILE__))
-
-#
-# Scans committee pages for compliance with requirements and recommendations:
+# Scans Apache project homepages and captures text|urls for common links
+# Gathers data that can be used to check for policy compliance:
 #   https://www.apache.org/foundation/marks/pmcs#navigation
 #   http://www.apache.org/events/README.txt
+#   See Also: lib/whimsy/sitestandards.rb
 #
 # Makes no value judgements.  Simply extracts raw data for offline analysis.
-#
-
-require 'whimsy/asf'
+$LOAD_PATH.unshift File.realpath(File.expand_path('../../lib', __FILE__))
 require 'net/http'
 require 'nokogiri'
 require 'json'
-
+require 'whimsy/asf'
 require 'whimsy/cache'
+require 'whimsy/sitestandards'
 
+IMAGE_DIR = ASF::SVN.find('site-img')
+
+# Normalize spaces in text runs
 def squash(text)
-  text.scrub.gsub(/[[:space:]]+/, ' ').strip
+  return text.scrub.gsub(/[[:space:]]+/, ' ').strip
 end
 
-#########################################################################
-
-IMAGE_DIR = ASF::SVN.find('site-img')
+# Get text from a node; use parent if text does not appear to be complete
+# This is used when scanning for some links that may 
+#   be in an image or other commonly related node on websites
+def getText(txt, node, match=/Apache Software Foundation/i)
+  parent = nil # debug to show where parent needed to be fetched
+  if not txt =~ match # have we got all the text?
+    if node.parent.name == 'a' # e.g. whimsical. such parents don't have extra text.
+      newnode = node.parent.parent
+    else
+      newnode = node.parent
+    end
+    # ensure <br> is treated as a separator when extracting the combined text
+    newnode.css('br').each{ |br| br.replace(" ") }
+    txt = squash(newnode.text)
+    parent = true
+  end
+  return txt, parent
+end
 
+# Parse an Apache project website and return text|urls that match our checks
+# @return Hash of symbols: text|url found from a check made
+# @see SiteStandards for definitions of what we should scan for (in general)
 def parse(id, site, name)
+  data = {}
+  SiteStandards::COMMON_CHECKS.keys.each do |k|
+    data[k.to_sym] = nil
+  end
+  data[:display_name] = name
+  data[:uri] = site
   uri = URI.parse(site)
-    
-  # default data
-  data = {
-    display_name: name,
-    uri: site,
-    events: nil,
-    foundation: nil,
-    license: nil,
-    sponsorship: nil,
-    security: nil,
-    trademarks: nil,
-    copyright: nil,
-    image: nil,
-  }
-
-  # check if site exists
   begin
     Socket.getaddrinfo(uri.host, uri.scheme)
-  rescue SocketError
+  rescue SocketError => se
+    data[:errors] = se.message
+    return data
+  end
+  begin
+    uri, response, status = $cache.get(site.to_s)
+  rescue IOError => ioe
+    data[:errors] = ioe.message
     return data
   end
-
-  uri, response, status = $cache.get(site.to_s)
   $stderr.puts "#{id} #{uri} #{status}"
-  return data if response.respond_to? :code and response.code =~ /^[45]/ 
+  # Bail and return if getting the site returns an error code
+  if response.respond_to? :code and response.code =~ /^[45]/ 
+    data[:errors] = "cache.get(#{site.to_s}) error code #{response.code}"
+    return data
+  end
   doc = Nokogiri::HTML(response)
   data[:uri] = uri.to_s
 
-  # scan each link
+  # FIRST: scan each link's a_href to see if we need to capture it
   doc.css('a').each do |a|
-
-    # check the link targets
+    # Normalize the text and href for our capture purposes
     a_href = a['href'].to_s.strip
+    a_text = a.text.downcase.strip
+    $stderr.puts "#{a_text} #{a_href}" if $verbose
 
-    if a_href =~ %r{^https?://(www\.)?apache\.org/?$}
+    # Check the href urls for some patterns
+    if a_href =~ SiteStandards::COMMON_CHECKS['foundation'][SiteStandards::CHECK_CAPTURE]
       img = a.at('img')
       if img
         # use the title (hover text) in preference to the source
@@ -70,7 +90,7 @@ def parse(id, site, name)
       end
     end
 
-    if a_href.include? 'apache.org/events/'
+    if a_href =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
       img = a.at('img')
       if img
         data[:events] = uri + img['src'].strip
@@ -79,71 +99,52 @@ def parse(id, site, name)
       end
     end
 
-    # check the link text
-    a_text = a.text.downcase.strip
-    $stderr.puts "#{a_text} #{a_href}" if $verbose
-
-    # Link text is supposed to be just "License" according to:
-    # https://www.apache.org/foundation/marks/pmcs#navigation
-    if a_text =~ /^license$/ and a_href.include? 'apache.org'
+    # Check the a_text strings for other patterns
+    # Note this is an unusual case
+    if (a_text =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_TEXT]) and 
+        (a_href =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_CAPTURE])
       begin
         data[:license] = uri + a_href 
       rescue
         data[:license] = a_href
       end
     end
-
-    if a_text =~ /\Athanks[!]?\z/ # Allow Thanks! with exclamation
-      begin
-        data[:thanks] = uri + a_href 
-      rescue
-        data[:thanks] = a_href
-      end
-    end
-
-    if a_text == 'security'
-      begin
-        data[:security] = uri + a_href 
-      rescue
-        data[:security] = a_href
-      end
-    end
-
-    if a_text =~ %r{sponsorship|donate|sponsor\sapache|sponsoring\sapache|sponsor}
-      begin
-        data[:sponsorship] = uri + a_href
-      rescue
-        data[:sponsorship] = a_href
+    
+    %w(thanks security sponsorship).each do |check|
+      if a_text =~ SiteStandards::COMMON_CHECKS[check][SiteStandards::CHECK_CAPTURE]
+        begin
+          data[check.to_sym] = uri + a_href 
+        rescue
+          data[check.to_sym] = a_href
+        end
       end
     end
   end
 
-  # Now scan the page text
+  # SECOND: scan each text node to match and capture
   doc.traverse do |node|
     next unless node.is_a?(Nokogiri::XML::Text)
-
     txt = squash(node.text)
-
     # allow override if phrase looks good
-    if (txt =~ /\btrademarks\b/  and not data[:trademarks]) or txt =~/are trademarks of [Tt]he Apache Software/
+    if (txt =~ SiteStandards::COMMON_CHECKS['trademarks'][SiteStandards::CHECK_CAPTURE] and not data[:trademarks]) or txt =~/are trademarks of [Tt]he Apache Software/
       t, p = getText(txt, node)
       # drop previous text if it looks like Copyright sentence
       data[:trademarks] = t.sub(/^.*?Copyright .+? Foundation[.]?/,'').strip
       data[:tradeparent] = p if p
     end
-    if txt =~ /Copyright / or txt =~ /©/
+    if txt =~ SiteStandards::COMMON_CHECKS['copyright'][SiteStandards::CHECK_CAPTURE]
       t, p = getText(txt, node)
       # drop text around the Copyright (or the symbol)
       data[:copyright] = t.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/,'\1').strip
       data[:copyparent] = p if p
     end
-    if txt =~ / Incubation is required of all newly accepted projects /
+    # Note we also check for incubator disclaimer (immaterial of tlp|podling)
+    if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
       t, p = getText(txt, node, / is an effort undergoing/)
       data[:disclaimer] = t
     end
   end
-
-  # see if image has been uploaded
+  # THIRD: see if an image has been uploaded
   if IMAGE_DIR
     data[:image] = Dir[File.join(IMAGE_DIR, "#{id}.*")].
       map {|path| File.basename(path)}.first
@@ -152,65 +153,52 @@ def parse(id, site, name)
   return data
 end
 
-# get the text; use parent if text does not appear to be complete
-def getText(txt, node, match=/Apache Software Foundation/i)
-  parent = nil # debug to show where parent needed to be fetched
-  if not txt =~ match # have we got all the text?
-    if node.parent.name == 'a' # e.g. whimsical. such parents don't have extra text.
-      newnode = node.parent.parent
-    else
-      newnode = node.parent
-    end
-    # ensure <br> is treated as a separator when extracting the combined text
-    newnode.css('br').each{ |br| br.replace(" ") }
-    txt = squash(newnode.text)
-    parent = true
-  end
-  return txt, parent
-end
-
-$verbose = ARGV.delete '--verbose'
-
+#########################################################################
+# Main execution begins here
 results = {}
-
 podlings = {}
-
 $cache = Cache.new(dir: 'site-scan')
+$verbose = ARGV.delete '--verbose'
 
-# Parse a single site given its URL
-if (1..2).include? ARGV.length and ARGV.first =~ /^https?:\/\/\w/
+# USAGE:
+# site-scan.rb https://whimsical.apache.org [whimsy] [whimsy-scan.json] - to scan one project
+# site-scan.rb [project-output.json] [podlings-output.json] [projname podlingname ...]
+# If additional projname|podlingname are provided, only scans those sites
+if ARGV.first =~ /^https?:\/\/\w/
+  # Scan a single URL provided by user
   site = ARGV.shift
   name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
+  output_projects = ARGV.shift
   results[name] = parse(name, site, name)
 else
+  # Gather output filenames (if any) and scan various projects
   if ARGV.first =~ %r{[./]} # have we a file name?
-    outfile = ARGV.shift
+    output_projects = ARGV.shift
     if ARGV.first =~ %r{[./]} # have we another file name?
-      outfile2 = ARGV.shift
+      output_podlings = ARGV.shift
     else
-      outfile2 = nil
+      output_podlings = nil
     end
   else
-    outfile = nil
+    output_projects = nil
   end
-  # scan all committees, including non-pmcs
+
+  # Scan committees, including non-pmcs
   ASF::Committee.load_committee_info
   committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq
-  
   committees.sort_by {|committee| committee.name}.each do |committee|
     next unless committee.site
-    # if parameters specified, parse only those names
+    # if more parameters specified, parse only those names
     if ARGV.length > 0
       next unless ARGV.include? committee.name
     end
-
-    # fetch, parse committee site
     results[committee.name] = parse(committee.name, committee.site, committee.display_name)
-    
   end
+  
+  # Scan podlings that have a website
   ASF::Podling.list.each do |podling| 
     if podling.status == 'current' and podling.podlingStatus[:website]
-      # if parameters specified, parse only those names
+      # if more parameters specified, parse only those names
       if ARGV.length > 0
         next unless ARGV.include? podling.name
       end
@@ -219,12 +207,14 @@ else
   end
 end
 
-# Output results
-if outfile
-  File.write(outfile, JSON.pretty_generate(results))
+# Output all results
+if output_projects
+  File.write(output_projects, JSON.pretty_generate(results))
 else
   puts JSON.pretty_generate(results)
 end
-if outfile2
-  File.write(outfile2, JSON.pretty_generate(podlings))
+if output_podlings
+  File.write(output_podlings, JSON.pretty_generate(podlings))
+else
+  puts JSON.pretty_generate(podlings)
 end

-- 
To stop receiving notification emails like this one, please contact
[email protected].

[whimsy] branch master updated: Reorganize file; use CHECK_CAPTURE values; improve command line args

Reply via email to