commit 689a144824199277abf25dc4c0db609d307bab3e
Author: Kornel Benko <[email protected]>
Date:   Sun Jan 3 15:25:09 2016 +0100

    Cmake url tests: Use more sophiticated check for urls.

diff --git a/development/checkurls/CMakeLists.txt 
b/development/checkurls/CMakeLists.txt
index 20af8f9..debedc6 100644
--- a/development/checkurls/CMakeLists.txt
+++ b/development/checkurls/CMakeLists.txt
@@ -28,6 +28,7 @@ add_test(NAME "check_accessible_urls"
     "filesToScan=${LYXFILES_FILE}"
     "ignoredURLS=${CMAKE_CURRENT_SOURCE_DIR}/inaccessibleURLS"
     "ignoredURLS=${CMAKE_CURRENT_SOURCE_DIR}/knownInvalidURLS"
+    "knownToRegisterURLS=${CMAKE_CURRENT_SOURCE_DIR}/knownToRegisterURLS"
     
"summaryFile=${TOP_BINARY_DIR}/Testing/Temporary/LastFailedAccessibleURLS.log")
 
 # Test inaccessible, but revert the error marker (failed <=> passed)
@@ -39,6 +40,7 @@ add_test(NAME "check_inaccessible_urls"
     "filesToScan=${LYXFILES_FILE}"
     "selectedURLS=${CMAKE_CURRENT_SOURCE_DIR}/inaccessibleURLS"
     "revertedURLS=${CMAKE_CURRENT_SOURCE_DIR}/inaccessibleURLS"
+    "knownToRegisterURLS=${CMAKE_CURRENT_SOURCE_DIR}/knownToRegisterURLS"
     
"summaryFile=${TOP_BINARY_DIR}/Testing/Temporary/LastFailedInaccessibleURLS.log")
 
 #
@@ -50,6 +52,7 @@ add_test(NAME "check_invalid_urls"
   COMMAND ${PERL_EXECUTABLE} "${SEARCH_URL_SCRIPT}"
     "extraURLS=${CMAKE_CURRENT_SOURCE_DIR}/knownInvalidURLS"
     "revertedURLS=${CMAKE_CURRENT_SOURCE_DIR}/knownInvalidURLS"
+    "knownToRegisterURLS=${CMAKE_CURRENT_SOURCE_DIR}/knownToRegisterURLS"
     
"summaryFile=${TOP_BINARY_DIR}/Testing/Temporary/LastFailedKnownInvalidURLS.log")
 
 set(URL_TEST_NAMES "check_accessible_urls" "check_inaccessible_urls" 
"check_invalid_urls")
diff --git a/development/checkurls/CheckURL.pm 
b/development/checkurls/CheckURL.pm
index b7d0e03..d0a8fad 100755
--- a/development/checkurls/CheckURL.pm
+++ b/development/checkurls/CheckURL.pm
@@ -30,26 +30,11 @@ sub check_url($);
 
 sub check_http_url($$$$)
 {
-  use Net::HTTP;
-  use Net::HTTPS;
+  require LWP::UserAgent;
 
   my ($protocol, $host, $path, $file) = @_;
 
-  my $s;
-  if ($protocol eq "http") {
-    $s = Net::HTTP->new(Host => $host, Timeout => 120);
-  }
-  elsif ($protocol eq "https") {
-    $s = Net::HTTPS->new(Host => $host, Timeout => 120);
-  }
-  else {
-    print " Unhandled http protocol \"$protocol\"";
-    return 3;
-  }
-  if (! $s) {
-    print " " . $@;
-    return 3;
-  }
+  my $ua = LWP::UserAgent->new;
   my $getp = "/";
   if ($path ne "") {
     $getp .= $path;
@@ -62,27 +47,32 @@ sub check_http_url($$$$)
       $getp .= "/$file";
     }
   }
-  #print " Trying to use GET  => \"$getp\"";
-  $s->write_request(GET => $getp, 'User-Agent' => "Mozilla/6.0");
-  my($code, $mess, %h) = $s->read_response_headers;
-
-  # Try to read something
   my $buf;
-  my $n = $s->read_entity_body($buf, 1024);
-  if (! defined($n)) {
-    print " Read from \"$protocol://$host$getp\" ";
+  $ua->agent("Firefox/43.0");
+  my $response = $ua->get("$protocol://$host$getp");
+  if ($response->is_success) {
+    $buf = $response->decoded_content;
+  }
+  else {
+    print " " . $response->status_line . ": ";
     return 3;
   }
-  if ($buf =~ /\<title\>([^\<]*404[^\<]*)\<\/title\>/i) {
+  my @title = ();
+  my $res = 0;
+  while ($buf =~ s/\<title\>([^\<]*)\<\/title\>//i) {
     my $title = $1;
-    $title =~ s/\n/ /g;
-    print "title = \"$title\"\n";
-    if ($title =~ /Error 404|404 Not Found/) {
-      print " Page reports 'Error 404' from \"$protocol://$host$getp\" ";
-      return 3;
+    $title =~ s/[\r\n]/ /g;
+    $title =~ s/  +/ /g;
+    $title =~ s/^ //;
+    $title =~ s/ $//;
+    push(@title, $title);
+    print "title = \"$title\": ";
+    if ($title =~ /Error 404|Not Found/) {
+      print " Page reports 'Not Found' from \"$protocol://$host$getp\": ";
+      $res = 3;
     }
   }
-  return 0;
+  return $res;
 }
 
 # Returns ($err, $isdir)
diff --git a/development/checkurls/knownToRegisterURLS 
b/development/checkurls/knownToRegisterURLS
new file mode 100644
index 0000000..5a3fb81
--- /dev/null
+++ b/development/checkurls/knownToRegisterURLS
@@ -0,0 +1,7 @@
+# Urls probably exist, but to check
+# we need to register and login first
+http://www.issn.org/en/node/344
+http://www.springer.de/author/tex/help-journals.html
+http://www.wkap.nl/jrnllist.htm/JRNLHOME
+http://www.wkap.nl/kaphtml.htm/STYLEFILES
+
diff --git a/development/checkurls/search_url.pl 
b/development/checkurls/search_url.pl
index 8b90044..c656dfe 100755
--- a/development/checkurls/search_url.pl
+++ b/development/checkurls/search_url.pl
@@ -59,6 +59,7 @@ my %ignoredURLS = ();
 my %revertedURLS = ();
 my %extraURLS = ();
 my %selectedURLS = ();
+my %knownToRegisterURLS = ();
 my $summaryFile = undef;
 
 my $checkSelectedOnly = 0;
@@ -82,11 +83,14 @@ for my $arg (@ARGV) {
     readUrls($val, %revertedURLS);
   }
   elsif ($type eq "extraURLS") {
-    readUrls($val,  %extraURLS);
+    readUrls($val, %extraURLS);
   }
   elsif ($type eq "selectedURLS") {
     $checkSelectedOnly = 1;
-    readUrls($val,  %selectedURLS);
+    readUrls($val, %selectedURLS);
+  }
+  elsif ($type eq "knownToRegisterURLS") {
+    readUrls($val, %knownToRegisterURLS);
   }
   elsif ($type eq "summaryFile") {
     if (open(SFO, '>', "$val")) {
@@ -104,13 +108,14 @@ my $errorcount = 0;
 my $URLScount = 0;
 
 for my $u (@urls) {
-  if (defined($selectedURLS{$u})) {
-    ${selectedURLS}{$u}->{count} += 1;
-  }
   if (defined($ignoredURLS{$u})) {
     $ignoredURLS{$u}->{count} += 1;
     next;
   }
+  next if (defined($knownToRegisterURLS{$u}));
+  if (defined($selectedURLS{$u})) {
+    ${selectedURLS}{$u}->{count} += 1;
+  }
   next if ($checkSelectedOnly && ! defined($selectedURLS{$u}));
   $URLScount++;
   print "Checking '$u': ";

Reply via email to