This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new e891f8ac More general link text extraction
e891f8ac is described below

commit e891f8acdcbbca1752225b3f61af1da6643f25f4
Author: Sebb <[email protected]>
AuthorDate: Thu Mar 21 17:44:35 2024 +0000

    More general link text extraction
---
 tools/site-scan.rb | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/site-scan.rb b/tools/site-scan.rb
index b22d830e..6475367c 100755
--- a/tools/site-scan.rb
+++ b/tools/site-scan.rb
@@ -52,6 +52,17 @@ def save_events(data, value)
   end
 end
 
+# Extract link text, skipping invisible stuff (assumed to be a class ending with '-sr-only')
+def get_link_text(anode)
+  bits = []
+  anode.traverse do |node|
+    if node.name == 'text'
+      bits << node.text unless node.parent.name == 'span' and  node.parent.attribute('class')&.value&.end_with? '-sr-only'
+    end
+end
+  bits.join(' ')
+end
+
 # Parse an Apache project website and return text|urls that match our checks
 # @return Hash of symbols: text|url found from a check made
 # @see SiteStandards for definitions of what we should scan for (in general)
@@ -106,8 +117,7 @@ def parse(id, site, name)
 
     # Normalize the text and href for our capture purposes
     a_href = a['href'].to_s.strip
-    # HACK to fix TsFile; should have better way to filter out such text
-    a_text = a.text.downcase.sub('open in new window', '').strip
+    a_text = get_link_text(a) # Not down-cased yet
     $stderr.puts "#{a_text.inspect} #{a_href}" if $verbose
 
     # Check the href urls for some patterns
@@ -117,7 +127,7 @@ def parse(id, site, name)
         # use the title (hover text) in preference to the source
         data[:foundation] = img['title'] ? squash(img['title']) : uri + img['src'].strip
       else
-        data[:foundation] = squash(a.text)
+        data[:foundation] = squash(a_text)
       end
     end
 
@@ -127,6 +137,7 @@ def parse(id, site, name)
     end
 
     # Check the a_text strings for other patterns
+    a_text = a_text.downcase.strip # needs to be downcased here
     # Note this is an unusual case
     if (a_text =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_TEXT]) and
         (a_href =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_CAPTURE])

Reply via email to