http://www.mediawiki.org/wiki/Special:Code/MediaWiki/91970
Revision: 91970
Author: kbrown
Date: 2011-07-12 15:42:39 +0000 (Tue, 12 Jul 2011)
Log Message:
-----------
*remove unnecessary inclusion protection
*start function to parse wget logs
Modified Paths:
--------------
trunk/extensions/ArchiveLinks/ArchiveLinks.class.php
trunk/extensions/ArchiveLinks/ArchiveLinks.i18n.php
trunk/extensions/ArchiveLinks/ArchiveLinks.php
trunk/extensions/ArchiveLinks/SpecialModifyArchiveBlacklist.php
trunk/extensions/ArchiveLinks/SpecialViewArchive.php
trunk/extensions/ArchiveLinks/spider.php
Modified: trunk/extensions/ArchiveLinks/ArchiveLinks.class.php
===================================================================
--- trunk/extensions/ArchiveLinks/ArchiveLinks.class.php 2011-07-12
15:38:07 UTC (rev 91969)
+++ trunk/extensions/ArchiveLinks/ArchiveLinks.class.php 2011-07-12
15:42:39 UTC (rev 91970)
@@ -2,10 +2,6 @@
/**
* Main Extension Class for Archive Links
*/
-if ( !defined( 'MEDIAWIKI' ) ) {
- echo( "This file is an extension to the MediaWiki software and cannot
be used standalone.\n" );
- die( 1 );
-}
class ArchiveLinks {
public static function queueExternalLinks ( &$article ) {
Modified: trunk/extensions/ArchiveLinks/ArchiveLinks.i18n.php
===================================================================
--- trunk/extensions/ArchiveLinks/ArchiveLinks.i18n.php 2011-07-12 15:38:07 UTC
(rev 91969)
+++ trunk/extensions/ArchiveLinks/ArchiveLinks.i18n.php 2011-07-12 15:42:39 UTC
(rev 91970)
@@ -8,8 +8,8 @@
//English
$messages['en'] = array(
'archivelinks-cache-title' => 'cache',
- 'ModifyArchiveBlacklist' => 'Modify Archive Blacklist',
- 'ViewArchive' => 'View Archive',
+ 'modifyarchiveblacklist' => 'Modify Archive Blacklist',
+ 'viewarchive' => 'View External Link Archive',
'archivelinks-modify-blacklist-desc' => 'This page allows you to
blacklist or whitelist URLs for the ArchiveLinks extension.',
//'archivelinks-archive-blacklist-fieldset-label' => 'Blacklist a URL',
'archivelinks-modify-blacklist-url-field-label' => 'URL to Blacklist:',
Modified: trunk/extensions/ArchiveLinks/ArchiveLinks.php
===================================================================
--- trunk/extensions/ArchiveLinks/ArchiveLinks.php 2011-07-12 15:38:07 UTC
(rev 91969)
+++ trunk/extensions/ArchiveLinks/ArchiveLinks.php 2011-07-12 15:42:39 UTC
(rev 91970)
@@ -4,10 +4,6 @@
* This is an extension to archive preemptively archive external links so that
* in the even they go down a backup will be available.
*/
-if ( !defined( 'MEDIAWIKI' ) ) {
- echo( "This file is an extension to the MediaWiki software and cannot
be used standalone.\n" );
- die( 1 );
-}
error_reporting( E_ALL | E_STRICT );
Modified: trunk/extensions/ArchiveLinks/SpecialModifyArchiveBlacklist.php
===================================================================
--- trunk/extensions/ArchiveLinks/SpecialModifyArchiveBlacklist.php
2011-07-12 15:38:07 UTC (rev 91969)
+++ trunk/extensions/ArchiveLinks/SpecialModifyArchiveBlacklist.php
2011-07-12 15:42:39 UTC (rev 91970)
@@ -1,10 +1,5 @@
<?php
-if (!defined('MEDIAWIKI')) {
- echo( "This file is an extension to the MediaWiki software and cannot
be used standalone.\n" );
- die(1);
-}
-
class SpecialModifyArchiveBlacklist extends SpecialPage {
function __construct() {
Modified: trunk/extensions/ArchiveLinks/SpecialViewArchive.php
===================================================================
--- trunk/extensions/ArchiveLinks/SpecialViewArchive.php 2011-07-12
15:38:07 UTC (rev 91969)
+++ trunk/extensions/ArchiveLinks/SpecialViewArchive.php 2011-07-12
15:42:39 UTC (rev 91970)
@@ -3,11 +3,6 @@
* This special page exists to serve the cached versions of the pages that
have been archived.
*/
-if (!defined('MEDIAWIKI')) {
- echo( "This file is an extension to the MediaWiki software and cannot
be used standalone.\n" );
- die(1);
-}
-
class SpecialViewArchive extends SpecialPage {
private $db_master;
private $db_slave;
Modified: trunk/extensions/ArchiveLinks/spider.php
===================================================================
--- trunk/extensions/ArchiveLinks/spider.php 2011-07-12 15:38:07 UTC (rev
91969)
+++ trunk/extensions/ArchiveLinks/spider.php 2011-07-12 15:42:39 UTC (rev
91970)
@@ -45,12 +45,16 @@
}*/
if ( ( $url = $this->check_queue() ) !== false ) {
- switch( $wgArchiveLinksConfig['download_lib'] )
{
- case 'curl':
- die( 'At the current time
support for libcurl is not available.' );
- case 'wget':
- default:
- $this->call_wget( $url );
+ if ( isset(
$wgArchiveLinksConfig['download_lib'] ) ) {
+ switch(
$wgArchiveLinksConfig['download_lib'] ) {
+ case 'curl':
+ die( 'At the current
time support for libcurl is not available.' );
+ case 'wget':
+ default:
+ $this->call_wget( $url
);
+ }
+ } else {
+ $this->call_wget( $url );
}
}
}
@@ -59,45 +63,56 @@
private function call_wget( $url ) {
global $wgArchiveLinksConfig, $path;
- if ( array_key_exists( 'wget_path', $wgArchiveLinksConfig ) &&
file_exists( $wgArchiveLinksConfig['wget_path'] ) ) {
+ if ( isset( $wgArchiveLinksConfig['wget_path'] ) &&
file_exists( $wgArchiveLinksConfig['wget_path'] ) ) {
die ( 'Support is not yet added for wget in a different
directory' );
- } elseif ( file_exists(
"$path/extensions/ArchiveLinks/wget.exe" ) ) {
- if ( array_key_exists( 'file_types',
$wgArchiveLinksConfig ) ) {
+ } elseif ( file_exists( "$path/wget.exe" ) ) {
+ if ( isset( $wgArchiveLinksConfig['file_types'] ) ) {
if ( is_array(
$wgArchiveLinksConfig['file_types']) ){
- $accept_file_types = '-A ' . implode(
',', $wgArchiveLinksConfig['filetypes'] );
+ $accept_file_types = '-A ' . implode(
',', $wgArchiveLinksConfig['file_types'] );
} else {
$accept_file_types = '-A ' .
$wgArchiveLinksConfig['file_types'];
}
} else {
+ //we should set a default, for now we will
disable this for testing purposes, but this should be closed sometime later...
$accept_file_types = '';
}
//At the current time we are only adding support for
the local filestore, but swift support is something that will be added later
- switch( $wgArchiveLinksConfig['filestore'] ) {
+ //Add shutup operator for PHP notice, it's okay if this
is not set as it's an optional config value
+ switch( @$wgArchiveLinksConfig['filestore'] ) {
case 'local':
default:
- if ( array_key_exists(
'subfolder_name', $wgArchiveLinksConfig ) ) {
- $content_dir =
'extensions/ArchiveLinks/' . $wgArchiveLinksConfig['subfolder_name'];
- } elseif (
$wgArchiveLinksConfig['content_path'] ) {
- $content_dir = realpath(
$wgArchiveLinksConfig['content_path'] );
- if ( !$content_dir ) {
+ if ( isset(
$wgArchiveLinksConfig['subfolder_name'] ) ) {
+ $dir = $path .
$wgArchiveLinksConfig['subfolder_name'];
+ } elseif ( isset(
$wgArchiveLinksConfig['content_path'] ) ) {
+ $dir = realpath(
$wgArchiveLinksConfig['content_path'] );
+ if ( !$dir ) {
die ( 'The path you
have set for $wgArchiveLinksConfig[\'content_path\'] does not exist. ' .
'This
makes the spider a very sad panda. Please either create it or use a different
setting.');
}
} else {
- $content_dir =
'extensions/ArchiveLinks/' . 'archived_content/';
+ $dir = $path .
'/archived_content/';
}
- $dir = $path . $content_dir . sha1(
time() . ' - ' . $url );
+ $dir = $dir . sha1( time() . ' - ' .
$url );
+ mkdir( $dir, 0644, TRUE );
$dir = escapeshellarg( $dir );
$sanitized_url = escapeshellarg( $url );
}
- if ( array_key_exists( 'wget_quota',
$wgArchiveLinksConfig ) ) {
- $quota = $wgArchiveLinksConfig['wget_quota'];
- } else {
+
+ if ( ! isset( $wgArchiveLinksConfig['wget_quota'] ) ) {
//We'll set the default max quota for any
specific web page for 8 mb, which is kind of a lot but should allow for large
images
$quota = '8m';
}
- shell_exec( "cd $path/extensions/ArchiveLinks/" );
- shell_exec( "wget.exe -nH -p -H -E -k -Q$quota -P $dir
$accept_file_types $sanitized_url" );
+
+ if ( !isset( $wgArchiveLinksConfig['retry_times'] ) ) {
+ //by default wget is set to retry something 20
times which is probably *way* too high for our purposes
+ //this has the potential to really slow it down
as --waitretry is set to 10 seconds by default, meaning that it would take
+ //serveral minutes to go through all the
retries which has the potential to stall the spider unnecessarily
+ $wgArchiveLinksConfig['retry_times'] = '3';
+ }
+
+ shell_exec( "cd $path" );
+ shell_exec( "wget.exe -nv -p -H -E -k -t
{$wgArchiveLinksConfig['retry_times']} -Q{$wgArchiveLinksConfig['retry_times']}
-o $dir/log.txt -P $dir $accept_file_types $sanitized_url" );
+ $this->parse_wget_log( "$dir/log.txt", $url );
} else {
//this is primarily designed with windows in mind and
no built in wget, so yeah, *nix support should be added, in other words note to
self...
die ( 'wget must be installed in order for the spider
to function in wget mode' );
@@ -173,7 +188,7 @@
$reserve_time =
explode( ' ', $row['in_progress'] );
$reserve_time =
$reserve_time[2];
- array_key_exists(
'in_progress_ignore_delay', $wgArchiveLinksConfig ) ? $ignore_in_prog_time =
$wgArchiveLinksConfig['in_progress_ignore_delay'] :
+ isset(
$wgArchiveLinksConfig['in_progress_ignore_delay'] ) ? $ignore_in_prog_time =
$wgArchiveLinksConfig['in_progress_ignore_delay'] :
$ignore_in_prog_time = 7200;
if ( $time -
$reserve_time - $wait_time > $ignore_in_prog_time ) {
@@ -220,12 +235,55 @@
}
private function reserve_job( $row ) {
+ // this function was pulled out of replication_check_queue,
need to fix the vars in here
$this->jobs['execute_urls'][] = $row['url'];
- $this->db_master->update( 'el_archive_queue', array(
$row['in_progress'] => "\"$pid\"" ), array( 'queue_id' => $row['queue_id'] ),
+ $this->db_master->update( 'el_archive_queue', array(
$row['in_progress'] => "\"{$this->jobs['pid']}\"" ), array( 'queue_id' =>
$row['queue_id'] ),
__METHOD__ ) or die( 'can\'t reserve job' );
$this->delete_dups( $row['url'] );
return true;
}
+
+ private function parse_wget_log( $log_path, $url ) {
+ $fp = fopen( $log_path, 'r' ) or die( 'can\'t find wget log
file to parse' );
+
+ $downloaded_files = array ( 'failed' => array(), 'success' =>
array() );
+
+ while ( $line = fgets( $fp ) ) {
+ $line_regexes = array (
+ 'url' => '%\^d{4}-(?:\d{2}-?){2} (?:\d{2}:?){3}
URL:(http://.*) \[.+\] ->%',
+ 'finish' => '%^Downloaded: \d+ files,
(\d+(?:K|M)).*%',
+ 'sole_url' => '%^(http://.*):%',
+ 'error' => '%^\d{4}-(?:\d{2}-?){2}
(?:\d{2}:?){3} ERROR (\d){3}:(.+)%',
+
+ );
+ foreach( $line_regexes as $line_type => $regex ) {
+ if ( preg_match( $regex, $line, $matches ) ) {
+ switch ( $line_type ) {
+ case 'url':
+
$downloaded_files['success'][] = $matches[1];
+ $last_line = 'url';
+ break;
+ case 'sole_url':
+
$downloaded_files['failed'][]['url'] = $matches[1];
+ break;
+ case 'error':
+ end(
$downloaded_files['failed'] );
+ $array_key = key(
$downloaded_files['failed'] );
+
$downloaded_files['failed'][$array_key]['error_code'] = $matches[1];
+
$downloaded_files['failed'][$array_key]['error_text'] = $matches[2];
+ break;
+ case 'finish':
+ $finish_time =
$matches[1];
+ break;
+ default:
+ //we missed a line
type, this is mainly for testing purposes and shouldn't happen when parsing the
log
+ echo "\n\nUNKNOWN LINE:
$line\n\n";
+ break;
+ }
+ }
+ }
+ }
+ }
}
$maintClass = 'ArchiveLinksSpider';
_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs