Martineznovo has uploaded a new change for review. https://gerrit.wikimedia.org/r/236429
Change subject: Various fixes to grabNewText.php ...................................................................... Various fixes to grabNewText.php OMG how did this script ever work? PROBLEM: The logevents API parameter letype doesn't accept multiple values. Multiple values work only on Wikia and break compatibility with other wikis. SOLVED: Removed the filter, since having additional entries doesn't matter anyway. PROBLEM: On 1.25, inserts into the page table fail because page_counter was removed from core in 1.25. SOLVED: Check if the wiki supports page counters before attempting to use this field. PROBLEM: rev_parent_id was always 0 on inserted revisions, resulting in wrong delta sizes in page histories, and contributions/recent changes treating each change as a page creation. SOLVED: Now it won't insert a 0 but a null, so you can run populateParentId.php to fix only the newly added entries. PROBLEM: Minor edits weren't marked as such. SOLVED: Caused by a typo in the $revision variable name when retrieving the property from the API. PROBLEM: If a page is created and then moved within the range of recent changes that the script grabs, the move fails because it attempts to do the move on a not-yet-existing page. Edits are inserted, but on the original page title rather than the new one. SOLVED: - Ignore the page move if the title before the move doesn't exist in our database. - Added processing of log entries in recentchanges to also grab the original page if the move left a redirect. - TODO: The "Edits are inserted but on the original page title rather than the new one" problem is still not fixed... 
Moved various globals into member variables Change-Id: I14aef39fe896e8d07fa5eba3b2ce546ba9deaf01 --- M grabNewText.php 1 file changed, 181 insertions(+), 141 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/tools/grabbers refs/changes/29/236429/1 diff --git a/grabNewText.php b/grabNewText.php index cc5d5ec..90082f2 100755 --- a/grabNewText.php +++ b/grabNewText.php @@ -7,7 +7,8 @@ * @ingroup Maintenance * @author Jack Phoenix <[email protected]> * @author Calimonious the Estrange - * @version 0.6 + * @author Jesús Martínez <[email protected]> + * @version 0.7 * @date 1 January 2013 */ @@ -21,6 +22,56 @@ require_once( 'mediawikibot.class.php' ); class GrabNewText extends Maintenance { + + /** + * Whether our wiki supports page counters, to use counters if remote wiki also has them + * + * @var bool + */ + protected $supportsCounters; + + /** + * Start date + * + * @var string + */ + protected $startDate; + + /** + * End date + * + * @var string + */ + protected $endDate; + + /** + * Last revision in the current db + * + * @var int + */ + protected $lastRevision = 0; + + /** + * Handle to the reader database connection + * + * @var DatabaseBase + */ + protected $dbw; + + /** + * Handle to the writer database connection + * + * @var DatabaseBase + */ + protected $dbr; + + /** + * MediaWikiBot instance + * + * @var MediaWikiBot + */ + protected $bot; + public function __construct() { parent::__construct(); $this->mDescription = "Grab new changes from an external wiki and add it over an imported dump.\nFor use when the available dump is slightly out of date."; @@ -33,7 +84,7 @@ } public function execute() { - global $bot, $endDate, $startDate, $wgDBname, $lastRevision; + global $wgDBname; $url = $this->getOption( 'url' ); if( !$url ) { $this->error( "The URL to the source wiki\'s api.php must be specified!\n", true ); @@ -42,39 +93,46 @@ $user = $this->getOption( 'username' ); $password = $this->getOption( 'password' ); - $startDate = $this->getOption( 
'startdate' ); - if ( $startDate ) { - if ( !wfTimestamp( TS_ISO_8601, $startDate ) ) { + $this->startDate = $this->getOption( 'startdate' ); + if ( $this->startDate ) { + if ( !wfTimestamp( TS_ISO_8601, $this->startDate ) ) { $this->error( "Invalid startdate format.\n", true ); } } else { $this->error( "A timestamp to start from is required.\n", true ); } - $endDate = $this->getOption( 'enddate' ); - if ( $endDate ) { - if ( !wfTimestamp( TS_ISO_8601, $endDate ) ) { + $this->endDate = $this->getOption( 'enddate' ); + if ( $this->endDate ) { + if ( !wfTimestamp( TS_ISO_8601, $this->endDate ) ) { $this->error( "Invalid enddate format.\n", true ); } } else { - $endDate = wfTimestampNow(); + $this->endDate = wfTimestampNow(); } + + # Check if wiki supports page counters (removed from core in 1.25) + $this->dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 'db', $wgDBname ) ); + $this->supportsCounters = $this->dbr->fieldExists( 'page', 'page_counter', __METHOD__ ); + + # Get a single DB_MASTER connection + $this->dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', $wgDBname ) ); # bot class and log in if requested if ( $user && $password ) { - $bot = new MediaWikiBot( + $this->bot = new MediaWikiBot( $url, 'json', $user, $password, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:13.0) Gecko/20100101 Firefox/13.0.1' ); - if ( !$bot->login() ) { + if ( !$this->bot->login() ) { print "Logged in as $user...\n"; } else { print "Warning - failed to log in as $user.\n"; } } else { - $bot = new MediaWikiBot( + $this->bot = new MediaWikiBot( $url, 'json', '', @@ -97,33 +155,31 @@ * */ function processRecentChanges() { - global $wgDBname, $endDate, $startDate, $bot; $blackList = array(); # Don't get new edits for these $more = true; $count = 0; # Get last revision id to avoid duplicates - $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 'db', $wgDBname ) ); - $result = (int)$dbr->selectField( + $result = (int)$this->dbr->selectField( 'revision', 'rev_id', array(), 
__METHOD__, array( 'ORDER BY' => 'rev_id DESC' ) ); - $lastRevision = $result; + $this->lastRevision = $result; # Get edits $params = array( 'list' => 'recentchanges', 'rcdir' => 'newer', - 'rctype' => 'edit|new', + 'rctype' => 'edit|new|log', 'rclimit' => 'max', 'rcprop' => 'title|sizes|redirect|ids', - 'rcend' => $endDate + 'rcend' => $this->endDate ); - $rcstart = $startDate; + $rcstart = $this->startDate; $count = 0; $more = true; @@ -131,11 +187,15 @@ while ( $more ) { $params['rcstart'] = $rcstart; - $result = $bot->query( $params ); + $result = $this->bot->query( $params ); if ( empty( $result['query']['recentchanges'] ) ) { $this->output( 'No changes found...', true ); } foreach ( $result['query']['recentchanges'] as $entry ) { + # log entries may have page id 0, skip those to prevent processRevision to break + if ( $entry['pageid'] === 0 ) { + continue; + } # new pages, new uploads, edited pages # while more, parse into $pagesList if ( ( $count % 500 ) == 0 ) { @@ -167,7 +227,7 @@ if ( isset( $entry['redirect'] ) ) { $pageInfo['redirect'] = 1; } - $this->processPage( $pageInfo, $startDate ); + $this->processPage( $pageInfo, $this->startDate ); $count++; } @@ -186,30 +246,30 @@ * */ function processRecentLogs () { - global $bot, $endDate, $wgDBname, $startDate; $params = array( 'list' => 'logevents', 'ledir' => 'newer', - 'letype' => 'delete|move|import', + #letype doesn't accept multiple values. 
Multiple values works only on wikia but breaks on other standard wikis + #'letype' => 'delete|move|import', 'lelimit' => 'max', - 'leend' => $endDate + 'leend' => $this->endDate ); $lestart = null; + $count = 0; $more = true; $this->output( "Updating deleted and moved items...\n" ); while ( $more ) { if ( $lestart === null ) { - $params['lestart'] = $startDate; + $params['lestart'] = $this->startDate; } else { $params['lestart'] = $lestart; } - $result = $bot->query( $params ); + $result = $this->bot->query( $params ); if ( empty( $result['query']['logevents'] ) ) { $this->output( "No changes found...\n", true ); } else { - $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', $wgDBname ) ); foreach ( $result['query']['logevents'] as $logEntry ) { if ( ( $count % 500 ) == 0 ) { $this->output( "$count\n" ); @@ -228,6 +288,7 @@ $newns = $logEntry['move']['new_ns']; $newTitle = $this->sanitiseTitle( $newns, $logEntry['move']['new_title'] ); if ( !$pageID ) { + # When we don't leave redirect, the log event has pageid 0, otherwise it has the id of the redirect page that it creates $pageID = $this->getPageID( $ns, $title ); if ( !$pageID ) { $this->output( "$ns:$title not found in database.\n" ); @@ -240,21 +301,26 @@ $source = Title::newFromText( $logEntry['title'] ); $dest = Title::newFromText( $logEntry['move']['new_title'] ); - $dbw->begin( __METHOD__ ); - $err = $source->moveTo( $dest, false, '', $redirect ); - if ( $err !== true ) { - $msg = array_shift( $err[0] ); - $this->output( "\nFAILED: " . wfMessage( $msg, $err[0] )->text() ); + if ( $source->exists() ) { + $this->dbw->begin( __METHOD__ ); + $err = $source->moveTo( $dest, false, '', $redirect ); + if ( $err !== true ) { + $msg = array_shift( $err[0] ); + $this->output( "\nFAILED: " . 
wfMessage( $msg, $err[0] )->text() ); + } + $this->dbw->commit( __METHOD__ ); + } else { + # FIXME: It doesn't work when the page has been created during the recent changes scope, because the page still doesn't exist in our DB + # On the recentchanges processing stage, processPage will import it, but the log entry will be lost! + # TODO: Create raw log entry + $this->output( "$ns:$title moved before we inserted the page on the database!.\n" ); } - $dbw->commit( __METHOD__ ); - } - elseif ( $logEntry['action'] == 'delete' ) { + } elseif ( $logEntry['action'] == 'delete' ) { $this->output( "$ns:$title was deleted; updating....\n" ); # Delete our copy, move revisions -> archive - $this->updateDeleted( $ns, $title, $dbw ); - } - elseif ( $logEntry['action'] == 'restore' ) { + $this->updateDeleted( $ns, $title ); + } elseif ( $logEntry['action'] == 'restore' ) { $this->output( "$ns:$title was undeleted; updating....\n" ); # Remove any revisions from archive and process as new $page = $this->updateRestored( $ns, $title, $dbw ); @@ -264,9 +330,9 @@ $this->output( "$ns:$title processed.\n" ); $blackList[] = $ns."cowz".$title; } - } - elseif ( $logEntry['action'] == 'upload' ) { + } elseif ( $logEntry['action'] == 'upload' ) { $this->output( "$ns:$title was imported; updating....\n" ); + $pageID = $logEntry['pageid']; # Process as new if ( !$pageID ) { $pageID = ''; @@ -304,7 +370,6 @@ * defined, protection stuff is skipped. 
*/ function processPage( $page, $start = null ) { - global $wgDBname, $bot, $endDate; $pageID = $page['pageid']; $title = $page['title']; @@ -320,11 +385,9 @@ } $title = str_replace( ' ', '_', $title ); - $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', $wgDBname ) ); if ( $start ) { # Check if title is present - $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 'db', $wgDBname ) ); - $result = $dbr->select( + $result = $this->dbr->select( 'page', 'page_id', array( @@ -333,21 +396,21 @@ ), __METHOD__ ); - $row = $dbr->fetchObject( $result ); + $row = $this->dbr->fetchObject( $result ); if ( $row ) { $localID = $row->page_id; $titleIsPresent = true; } else { # Check if id is present - $result = $dbr->select( + $result = $this->dbr->select( 'page', 'page_title', array( 'page_id' => $pageID ), __METHOD__ ); - if ( $dbr->fetchObject( $result ) ) { - $resid = (int)$dbr->selectField( + if ( $this->dbr->fetchObject( $result ) ) { + $resid = (int)$this->dbr->selectField( 'page', 'page_id', array(), @@ -372,7 +435,7 @@ 'expiry' => ( $prot['expiry'] == 'infinity' ? 'infinity' : wfTimestamp( TS_MW, $prot['expiry'] ) ), 'id' => null ); - $dbw->insert( + $this->dbw->insert( 'page_restrictions', array( 'pr_page' => $e['page'], @@ -385,7 +448,7 @@ ), __METHOD__ ); - $dbw->commit(); + $this->dbw->commit(); # $this->output( "Committed page_restrictions changes.\n" ); } } @@ -405,7 +468,6 @@ # Retrieving the list of revisions, including text. 
$revision_latest; - $last_rev_id = 0; $more = true; $rvcontinue = null; # 'rvcontinue' in 1.20+, 'rvstartid' in 1.19- @@ -418,7 +480,7 @@ 'rvlimit' => 'max', 'rvprop' => 'ids|flags|timestamp|user|userid|comment|content', 'rvdir' => 'newer', - 'rvend' => wfTimestamp( TS_ISO_8601, $endDate ) + 'rvend' => wfTimestamp( TS_ISO_8601, $this->endDate ) ); if ( $start ) { $params['rvstart'] = wfTimestamp( TS_ISO_8601, $start ); @@ -430,13 +492,13 @@ $params[$rvcontinuename] = $rvcontinue; } - $result = $bot->query( $params ); + $result = $this->bot->query( $params ); if ( isset( $result['query']['pages'] ) ) { $revisions = array_values( $result['query']['pages'] ); $revisions = $revisions[0]['revisions']; foreach ( $revisions as $revision ) { - $last_rev_info = $this->processRevision( $revision, $localID, $last_rev_id ); + $last_rev_info = $this->processRevision( $revision, $localID ); } } else { $this->output( "Page id $pageID not found.\n" ); @@ -461,68 +523,49 @@ $page_e['latest'] = $last_rev_info[0]; $page_e['len'] = $last_rev_info[1]; + $insert_fields = array( + 'page_namespace' => $page_e['namespace'], + 'page_title' => $page_e['title'], + 'page_restrictions' => $page_e['restrictions'], + 'page_is_redirect' => $page_e['is_redirect'], + 'page_is_new' => $page_e['is_new'], + 'page_random' => $page_e['random'], + 'page_touched' => $page_e['touched'], + 'page_latest' => $page_e['latest'], + 'page_len' => $page_e['len'] + ); + if ( $this->supportsCounters && $page_e['counter'] ) { + $insert_fields['page_counter'] = $page_e['counter']; + } if ( !$start ) { - $dbw->insert( + $insert_fields['page_id'] = $page_e['id']; + $this->dbw->insert( 'page', - array( - 'page_id' => $page_e['id'], - 'page_namespace' => $page_e['namespace'], - 'page_title' => $page_e['title'], - 'page_restrictions' => $page_e['restrictions'], - 'page_counter' => $page_e['counter'], - 'page_is_redirect' => $page_e['is_redirect'], - 'page_is_new' => $page_e['is_new'], - 'page_random' => $page_e['random'], - 
'page_touched' => $page_e['touched'], - 'page_latest' => $page_e['latest'], - 'page_len' => $page_e['len'] - ), + $insert_fields, __METHOD__ ); } else { # update or insert if not present if ( $titleIsPresent ) { $this->output( "Updating page entry $localID\n" ); - $dbw->update( + $this->dbw->update( 'page', - array( - 'page_namespace' => $page_e['namespace'], - 'page_title' => $page_e['title'], - 'page_restrictions' => $page_e['restrictions'], - 'page_counter' => $page_e['counter'], - 'page_is_redirect' => $page_e['is_redirect'], - 'page_is_new' => $page_e['is_new'], - 'page_random' => $page_e['random'], - 'page_touched' => $page_e['touched'], - 'page_latest' => $page_e['latest'], - 'page_len' => $page_e['len'] - ), + $insert_fields, array( 'page_id' => $localID ), __METHOD__ ); } else { $this->output( "Inserting page entry $localID\n" ); - $dbw->insert( + $insert_fields['page_id'] = $localID; + $this->dbw->insert( 'page', - array( - 'page_id' => $localID, - 'page_namespace' => $page_e['namespace'], - 'page_title' => $page_e['title'], - 'page_restrictions' => $page_e['restrictions'], - 'page_counter' => $page_e['counter'], - 'page_is_redirect' => $page_e['is_redirect'], - 'page_is_new' => $page_e['is_new'], - 'page_random' => $page_e['random'], - 'page_touched' => $page_e['touched'], - 'page_latest' => $page_e['latest'], - 'page_len' => $page_e['len'] - ), + $insert_fields, __METHOD__ ); } } - $dbw->commit(); + $this->dbw->commit(); } /** @@ -531,25 +574,24 @@ * @param $revision Array: array retrieved from the API, containing the revision * text, ID, timestamp, whether it was a minor edit or * not and much more - * @param $page_e UNUSED - * @param $prev_rev_id Integer: previous revision ID (revision.rev_parent_id) + * @param $page_id page id of the revision we are going to insert, overriding what + * $revision say */ - function processRevision( $revision, $page_id, $prev_rev_id ) { - global $wgLang, $wgDBname, $lastRevision; + function processRevision( $revision, 
$page_id ) { + global $wgLang; - if ( $revision['revid'] <= $lastRevision) { + if ( $revision['revid'] <= $this->lastRevision) { # Oops? return false; } # Workaround check if it's already there. - $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 'db', $wgDBname ) ); - $result = $dbr->select( + $result = $this->dbr->select( 'revision', 'rev_page', array( 'rev_id' => $revision['revid'] ), __METHOD__ ); - if ( $dbr->fetchObject( $result ) ) { + if ( $this->dbr->fetchObject( $result ) ) { # Already in database return false; } @@ -570,43 +612,45 @@ 'user' => $revision['userid'], # May not be accurate to the new wiki, obvious, but whatever. 'user_text' => $revision['user'], 'timestamp' => wfTimestamp( TS_MW, $revision['timestamp'] ), - 'minor_edit' => ( isset( $reisionv['minor'] ) ? 1 : 0 ), + 'minor_edit' => ( isset( $revision['minor'] ) ? 1 : 0 ), 'deleted' => 0, #revdeleted; would need a handler elsewhere for these 'len' => strlen( $text ), - 'parent_id' => ( $prev_rev_id || 0 ) + 'parent_id' => $revision['parentid'], ); - $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', $wgDBname ) ); + $insert_fields = array( + 'rev_id' => $e['id'], + 'rev_page' => $e['page'], + 'rev_text_id' => $e['text_id'], + 'rev_comment' => $e['comment'], + 'rev_user' => $e['user'], + 'rev_user_text' => $e['user_text'], + 'rev_timestamp' => $e['timestamp'], + 'rev_minor_edit' => $e['minor_edit'], + 'rev_deleted' => $e['deleted'], + 'rev_len' => $e['len'], + ); + + # If we don't preserve ids it's better to not set it up than setting it wrong, + # because it's used to calculate deltas between revisions + #$insert_fields['rev_parent_id'] = $e['parent_id']; + $this->output( "Inserting revision {$e['id']}\n" ); - $dbw->insert( + $this->dbw->insert( 'revision', - array( - 'rev_id' => $e['id'], - 'rev_page' => $e['page'], - 'rev_text_id' => $e['text_id'], - 'rev_comment' => $e['comment'], - 'rev_user' => $e['user'], - 'rev_user_text' => $e['user_text'], - 'rev_timestamp' => 
$e['timestamp'], - 'rev_minor_edit' => $e['minor_edit'], - 'rev_deleted' => $e['deleted'], - 'rev_len' => $e['len'], - 'rev_parent_id' => $e['parent_id'], - ), + $insert_fields, __METHOD__ ); - $dbw->commit(); + $this->dbw->commit(); return array( $revision['revid'], $e['len'] ); } # Stores revision texts in the text table. function storeText( $text ) { - global $current_text_id, $wgDBname; if ( !isset( $current_text_id ) ) { - $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 'db', $wgDBname ) ); - $result = $dbr->select( + $result = $this->dbr->select( 'text', 'old_id', '', @@ -616,13 +660,13 @@ 'ORDER BY' => '`text`.`old_id` DESC' ) ); - $row = $dbr->fetchObject( $result ); + $row = $this->dbr->fetchObject( $result ); if ( $row ) { $current_text_id = $row->old_id; } else { $current_text_id = 0; } - $dbr->freeResult( $result ); + $this->dbr->freeResult( $result ); } $current_text_id++; @@ -632,8 +676,7 @@ 'flags' => '' ); - $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', $wgDBname ) ); - $dbw->insert( + $this->dbw->insert( 'text', array( 'old_id' => $e['id'], @@ -646,8 +689,7 @@ return $current_text_id; } - function updateDeleted( $ns, $title, $dbw ) { - global $wgDBname; + function updateDeleted( $ns, $title ) { $e = array( 'ar_text' => '', 'ar_flags' => '', @@ -661,8 +703,7 @@ } # Get and insert revision data - $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 'db', $wgDBname ) ); - $result = $dbr->select( + $result = $this->dbr->select( 'revision', array( 'rev_comment', @@ -693,16 +734,16 @@ $e['ar_parent_id'] = $row->rev_parent_id; $e['ar_sha1'] = $row->rev_sha1; - $dbw->insert( 'archive', $e, __METHOD__ ); + $this->dbw->insert( 'archive', $e, __METHOD__ ); } # Delete page and revision entries - $dbw->delete( + $this->dbw->delete( 'page', array( 'page_id' => $e['ar_page_id'] ), __METHOD__ ); - $dbw->delete( + $this->dbw->delete( 'revision', array( 'rev_page' => $e['ar_page_id'] ), __METHOD__ @@ -710,10 +751,10 @@ # Full clean up in general 
database rebuild. } - function updateRestored( $ns, $title, $dbw ) { + function updateRestored( $ns, $title ) { $pageID = $this->getPageID( $ns, $title ); if ( $pageID ) { - $dbw->delete( + $this->dbw->delete( 'archive', array( 'ar_title' => $title, @@ -737,8 +778,7 @@ # For use with deleted crap that chucks the id; spotty at best. function getPageID( $ns, $title ) { - $dbr = wfGetDB( DB_SLAVE ); - $result = $dbr->select( + $result = $this->dbr->select( 'page', array( 'page_id' ), array( @@ -747,7 +787,7 @@ ), __METHOD__ ); - $row = $dbr->fetchObject( $result ); + $row = $this->dbr->fetchObject( $result ); if ( $row ) { return $row->page_id; } else { -- To view, visit https://gerrit.wikimedia.org/r/236429 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I14aef39fe896e8d07fa5eba3b2ce546ba9deaf01 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/tools/grabbers Gerrit-Branch: master Gerrit-Owner: Martineznovo <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
