http://www.mediawiki.org/wiki/Special:Code/MediaWiki/90754
Revision: 90754
Author: kbrown
Date: 2011-06-25 04:45:52 +0000 (Sat, 25 Jun 2011)
Log Message:
-----------
Start the maintenance script that will do the actual spidering of links. Make
the Internet Archive the default if $wgArchiveLinksConfig['archive_service'] is
not set to a valid option.
Modified Paths:
--------------
trunk/extensions/ArchiveLinks/ArchiveLinks.php
Added Paths:
-----------
trunk/extensions/ArchiveLinks/Spider.php
Modified: trunk/extensions/ArchiveLinks/ArchiveLinks.php
===================================================================
--- trunk/extensions/ArchiveLinks/ArchiveLinks.php 2011-06-25 04:11:20 UTC
(rev 90753)
+++ trunk/extensions/ArchiveLinks/ArchiveLinks.php 2011-06-25 04:45:52 UTC
(rev 90754)
@@ -54,6 +54,7 @@
$wgArchiveLinksConfig = array (
'archive_service' => 'wikiwix',
'use_multiple_archives' => false,
+ 'run_spider_in_loop' => false,
);
class ArchiveLinks {
@@ -121,12 +122,14 @@
case 'wikiwix':
$link_to_archive =
'http://archive.wikiwix.com/cache/?url=' . $url;
break;
+ case 'webcitation':
+ $link_to_archive = 'http://webcitation.org/query?url='
. $url;
+ break;
case 'internet_archive':
+ default:
$link_to_archive = 'http://wayback.archive.org/web/*/'
. $url;
break;
- case 'webcitation':
- $link_to_archive = 'http://webcitation.org/query?url='
. $url;
- break;
+
}
}
//Note to self: need to fix this to use Html.php instead of direct
html
Added: trunk/extensions/ArchiveLinks/Spider.php
===================================================================
--- trunk/extensions/ArchiveLinks/Spider.php (rev 0)
+++ trunk/extensions/ArchiveLinks/Spider.php 2011-06-25 04:45:52 UTC (rev
90754)
@@ -0,0 +1,79 @@
+<?php
+/**
+ * This class is for the actual spidering and will be calling wget
+ */
+
+$path = getenv( 'MW_INSTALL_PATH' );
+if ( strval( $path ) === '' ) {
+ $path = dirname( __FILE__ ) . '/../..';
+}
+
+require_once "$path/maintenance/Maintenance.php";
+
+class ArchiveLinksSpider extends Maintenance {
+ private $db_master;
+ private $db_slave;
+ private $db_result;
+
+ public function execute() {
+ global $wgArchiveLinksConfig;
+
+ $this->db_master = $this->getDB( DB_MASTER );
+ $this->db_slave = $this->getDB( DB_SLAVE );
+ $this->db_result = array();
+
+ if ( $wgArchiveLinksConfig['run_spider_in_loop'] ) {
+ while ( TRUE ) {
+ if ( ( $url = $this->check_queue() ) !== false ) {
+ //do stuff
+ }
+ sleep(1);
+ }
+ } else {
+ if ( ( $url = $this->check_queue() ) !== false ) {
+ //do stuff
+ }
+ }
+ return null;
+ }
+
+ private function check_queue() {
+ $this->db_result['job-fetch'] = $this->db_slave->select(
'el_archive_queue', '*',
+ '`el_archive_queue`.`delay_time` <= ' . time()
+ . ' AND `el_archive_queue`.`in_progress` = 0'
+ . ' ORDER BY `el_archive_queue`.`queue_id` ASC'
+ . ' LIMIT 1');
+
+ if ( $this->db_result['job-fetch']->numRows() > 0 ) {
+ $row = $this->db_result['job-fetch']->fetchRow();
+
+ //Since we queried the slave instead of the master to check for dups when we
inserted, let's check
+ //that the job isn't in the queue twice; we don't want to archive
it twice
+ $this->db_result['dup-check'] = $this->db_slave->select(
'el_archive_queue', '*', '`el_archive_queue`.`url` = "' . $row['url']
+ . '" ORDER BY `el_archive_queue`.`queue_id` ASC' );
+
+ if ( $this->db_result['dup-check']->numRows() > 1 ) {
+ //keep only the original jobs and remove all duplicates
+ $this->db_result['dup-check']->fetchRow();
+ while ( $del_row = $this->db_result['dup-check']->fetchRow() ) {
+ echo 'you have a dup ';
+ var_dump( $del_row );
+ //this is commented out for testing purposes, so I don't have
to keep re-adding the duplicate to my test db
+ //in other words this has a giant "remove before flight"
ribbon hanging from it...
+ //$this->db_master->delete( 'el_archive_queue',
'`el_archive_queue`.`queue_id` = ' . $del_row['queue_id'] );
+ }
+
+ }
+
+ return $row['url'];
+ } else {
+ //there are no jobs to do right now
+ return false;
+ }
+ }
+}
+
+$maintClass = 'ArchiveLinksSpider';
+require_once RUN_MAINTENANCE_IF_MAIN;
+
+?>
\ No newline at end of file
Property changes on: trunk/extensions/ArchiveLinks/Spider.php
___________________________________________________________________
Added: svn:eol-style
+ native
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs