This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new 99ab4cca Show host name counts
99ab4cca is described below

commit 99ab4cca5027594a70c4459477b73b3f3c47ad1d
Author: Sebb <[email protected]>
AuthorDate: Sat Apr 30 21:40:14 2022 +0100

    Show host name counts
---
 tools/asf-site-check.rb | 8 ++++++++
 tools/site-scan.rb      | 9 ++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/asf-site-check.rb b/tools/asf-site-check.rb
index ee1378cb..b619e9cc 100644
--- a/tools/asf-site-check.rb
+++ b/tools/asf-site-check.rb
@@ -66,6 +66,14 @@ module ASFDOMAIN
       return true # a relative link
     end
   end
+  # Return external host name or nil
+  # extracts hostname and calls asfhost?
+  def self.to_ext_host(url)
+    if url =~ %r{\Ahttps?://(.+?)(/|\z)}i
+      return $1 unless asfhost?($1)
+    end
+    return nil
+  end
 end
 
 if __FILE__ == $0
diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index b519c3a5..9859bd7c 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -148,11 +148,10 @@ def parse(id, site, name)
   data[:image] = ASF::SiteImage.find(id)
 
   # Check for resource loading from non-ASF domains
-  js_urls  = doc.xpath('//script/@src').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
-  css_urls = doc.xpath('//link/@href').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
-  img_urls = doc.xpath('//img/@src').map(&:content).reject {|x| ASFDOMAIN.asfurl? x}
-  resources = js_urls.size + css_urls.size + img_urls.size
-  data[:resources] = "Found #{resources} external resources"
+  ext_urls  = doc.xpath('//script/@src', '//link/@href', '//img/@src').
+    map(&:content).map {|x| ASFDOMAIN.to_ext_host x}.compact.tally
+  resources = ext_urls.values.sum
+  data[:resources] = "Found #{resources} external resources: #{ext_urls}"
 
   #  TODO: does not find js references such as:
   #  ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';

Reply via email to