Martineznovo has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/357224 )

Change subject: Improvements to text grabbers
......................................................................

Improvements to text grabbers

grabNewText:
- Added handling of tags (ported from grabText)
- Added handling of content model (untested due to using 1.19 as source wiki)
- Added handling of revdeleted entries (ported from grabText)
- Added handling of external storage on destination wiki
- The check for reusing the text ID of a previous revision has been
  simplified to select only the immediately preceding revision of the page,
  which limits the number of rows it needs to query

grabText:
- Overall rewrite to mimic grabNewText's good practices: use member variables
  instead of globals; the methods from processPage to the end have been copied
  from grabNewText.php (with minor cleanup of unused parameters in grabText.php)
- Added namespaces parameter to limit which namespaces to grab
- Added start parameter to resume a grab from a given page
- Added handling of content model (untested due to using 1.19 as source wiki)
- Added handling of external storage on destination wiki

mediawikibot:
- Using an array for retry times instead of a dirty nested if, for ease of
  expanding
- Better error handling. Don't assume an array with the given query key will be
  returned, since this is not true when using generators that can return empty
  sets, which was causing such responses to be marked as errors and retried.
- Documented that login returns null if it succeeded (lol), and improved error
  handling there.

Change-Id: Id8ad5a991823fcb19b4779eaaa6528f8cdc27a9b
---
M grabNewText.php
M grabText.php
M mediawikibot.class.php
3 files changed, 807 insertions(+), 445 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/tools/grabbers 
refs/changes/24/357224/1

diff --git a/grabNewText.php b/grabNewText.php
index 41fd0c0..2ec2c6b 100755
--- a/grabNewText.php
+++ b/grabNewText.php
@@ -195,6 +195,7 @@
                $this->processRecentLogs();
                $this->processRecentChanges();
 
+               $this->output( "\nDone.\n" );
                # Done.
        }
 
@@ -415,6 +416,8 @@
         *     revisions that should be already in the database
         */
        function processPage( $page, $start = null, $skipPrevious = true ) {
+               global $wgContentHandlerUseDB;
+
                $pageID = $page['pageid'];
                $pageTitle = null;
                $pageDesignation = "id $pageID";
@@ -429,7 +432,7 @@
                $params = array(
                        'prop' => 'info|revisions',
                        'rvlimit' => 'max',
-                       'rvprop' => 
'ids|flags|timestamp|user|userid|comment|content',
+                       'rvprop' => 
'ids|flags|timestamp|user|userid|comment|content|tags',
                        'rvdir' => 'newer',
                        'rvend' => wfTimestamp( TS_ISO_8601, $this->endDate )
                );
@@ -444,6 +447,9 @@
                }
                if ( $page['protection'] ) {
                        $params['inprop'] = 'protection';
+               }
+               if ( $wgContentHandlerUseDB ) {
+                       $params['rvprop'] = $params['rvprop'] . '|contentmodel';
                }
 
                $result = $this->bot->query( $params );
@@ -482,6 +488,7 @@
                        'random' => wfRandom(),
                        'touched' => wfTimestampNow(),
                        'len' => 0,
+                       'content_model' => null
                );
                # Trim and convert displayed title to database page title
                # Get it from the returned value from api
@@ -494,6 +501,17 @@
                $page_e['len'] = $info_pages[0]['length'];
                $page_e['counter'] = ( isset( $info_pages[0]['counter'] ) ? 
$info_pages[0]['counter'] : 0 );
                $page_e['latest'] = $info_pages[0]['lastrevid'];
+               $defaultModel = null;
+               if ( $wgContentHandlerUseDB && isset( 
$info_pages[0]['contentmodel'] ) ) {
+                       # This would be the most accurate way of getting the 
content model for a page.
+                       # However it calls hooks and can be incredibly slow or 
cause errors
+                       #$defaultModel = ContentHandler::getDefaultModelFor( 
Title:makeTitle( $page_e['namespace'], $page_e['title'] ) );
+                       $defaultModel = MWNamespace::getNamespaceContentModel( 
$info_pages[0]['ns'] ) || CONTENT_MODEL_WIKITEXT;
+                       # Set only if not the default content model
+                       if ( $defaultModel != $info_pages[0]['contentmodel'] ) {
+                               $page_e['content_model'] = 
$info_pages[0]['contentmodel'];
+                       }
+               }
 
                # Check if page is present
                $pageIsPresent = false;
@@ -556,7 +574,7 @@
                while ( true ) {
                        foreach ( $info_pages[0]['revisions'] as $revision ) {
                                if ( ! $skipPrevious || $revision['revid'] > 
$this->lastRevision) {
-                                       $revisionsProcessed = 
$this->processRevision( $revision, $pageID ) || $revisionsProcessed;
+                                       $revisionsProcessed = 
$this->processRevision( $revision, $pageID, $defaultModel ) || 
$revisionsProcessed;
                                } else {
                                        $this->output( sprintf( "Skipping the 
processRevision of revision %d minor or equal to the last revision of the 
database (%d).\n", $revision['revid'], $this->lastRevision ) );
                                }
@@ -592,7 +610,8 @@
                        'page_random' => $page_e['random'],
                        'page_touched' => $page_e['touched'],
                        'page_latest' => $page_e['latest'],
-                       'page_len' => $page_e['len']
+                       'page_len' => $page_e['len'],
+                       'page_content_model' => $page_e['content_model']
                );
                if ( $this->supportsCounters && $page_e['counter'] ) {
                        $insert_fields['page_counter'] = $page_e['counter'];
@@ -625,10 +644,11 @@
         * @param array $revision Array retrieved from the API, containing the 
revision
         *     text, ID, timestamp, whether it was a minor edit or not and much 
more
         * @param int $page_id Page ID number of the revision we are going to 
insert
+        * @param string $defaultModel Default content model for this page
         * @return bool Whether revision has been inserted or not
         */
-       function processRevision( $revision, $page_id ) {
-               global $wgLang;
+       function processRevision( $revision, $page_id, $defaultModel ) {
+               global $wgLang, $wgContentHandlerUseDB;
                $revid = $revision['revid'];
 
                # Workaround check if it's already there.
@@ -644,12 +664,37 @@
                        return false;
                }
 
-               $text = $revision['*'];
-               $comment = $revision['comment'];
-               if ( $comment ) {
-                       $comment = $wgLang->truncate( $comment, 255 );
+               # Sloppy handler for revdeletions; just fills them in with 
dummy text
+               # and sets bitfield thingy
+               $revdeleted = 0;
+               if ( isset( $revision['userhidden'] ) ) {
+                       $revdeleted = $revdeleted | Revision::DELETED_USER;
+                       if ( !isset( $revision['user'] )) {
+                               $revision['user'] = ''; # username removed
+                       }
+                       if ( !isset( $revision['userid'] )) {
+                               $revision['userid'] = 0;
+                       }
+               }
+               if ( isset( $revision['commenthidden'] ) ) {
+                       $revdeleted = $revdeleted | Revision::DELETED_COMMENT;
+                       $comment = ''; # edit summary removed
                } else {
-                       $comment = '';
+                       $comment = $revision['comment'];
+                       if ( $comment ) {
+                               $comment = $wgLang->truncate( $comment, 255 );
+                       } else {
+                               $comment = '';
+                       }
+               }
+               if ( isset( $revision['texthidden'] ) ) {
+                       $revdeleted = $revdeleted | Revision::DELETED_TEXT;
+                       $text = ''; # This content has been removed.
+               } else {
+                       $text = $revision['*'];
+               }
+               if ( isset ( $revision['suppressed'] ) ) {
+                       $revdeleted = $revdeleted | 
Revision::DELETED_RESTRICTED;
                }
 
                $e = array(
@@ -660,15 +705,29 @@
                        'user_text' => $revision['user'],
                        'timestamp' => wfTimestamp( TS_MW, 
$revision['timestamp'] ),
                        'minor_edit' => ( isset( $revision['minor'] ) ? 1 : 0 ),
-                       'deleted' => 0, #revdeleted; would need a handler 
elsewhere for these
+                       'deleted' => $revdeleted,
                        'len' => strlen( $text ),
                        'parent_id' => $revision['parentid'],
                        # Do not attempt to get the field from api, because 
it's not what
                        # you'd expect. See T75411
-                       'sha1' => Revision::base36Sha1( $text )
+                       'sha1' => Revision::base36Sha1( $text ),
+                       'content_model' => null,
+                       'content_format' => null
                );
 
-               $e['text_id'] = $this->storeText( $text, $e['sha1'], $page_id );
+               $e['text_id'] = $this->storeText( $text, $e['sha1'], $page_id, 
$revid );
+
+               # Set content model
+               if ( $wgContentHandlerUseDB && isset( $revision['contentmodel'] 
) ) {
+                       # Set only if not the default content model
+                       if ( $defaultModel != $revision['contentmodel'] ) {
+                               $e['content_model'] = $revision['contentmodel'];
+                               $defaultFormat = ContentHandler::getForModelID( 
$defaultModel )->getDefaultFormat();
+                               if ( $defaultFormat != 
$revision['contentformat'] ) {
+                                       $e['content_format'] = 
$revision['contentformat'];
+                               }
+                       }
+               }
 
                $insert_fields = array(
                        'rev_id' => $e['id'],
@@ -683,14 +742,39 @@
                        'rev_len' => $e['len'],
                        'rev_parent_id' => $e['parent_id'],
                        'rev_sha1' => $e['sha1'],
+                       'rev_content_model' => $e['content_model'],
+                       'rev_content_format' => $e['content_format'],
                );
 
-               $this->output( "Inserting revision {$e['id']}\n" );
+               $this->output( sprintf( "Inserting revision %s\n", $e['id'] ) );
                $this->dbw->insert(
                        'revision',
                        $insert_fields,
                        __METHOD__
                );
+
+               # Insert tags, if any
+               if ( isset( $revision['tags'] ) && count( $revision['tags'] ) > 
0 ) {
+                       foreach ( $revision['tags'] as $tag ) {
+                               $this->dbw->insert(
+                                       'change_tag',
+                                       array(
+                                               'ct_rev_id' => $e['id'],
+                                               'ct_tag' => $tag,
+                                       ),
+                                       __METHOD__
+                               );
+                       }
+                       $this->dbw->insert(
+                               'tag_summary',
+                               array(
+                                       'ts_rev_id' => $e['id'],
+                                       'ts_tags' => implode( ',', 
$revision['tags'] ),
+                               ),
+                               __METHOD__
+                       );
+               }
+
                $this->dbw->commit();
 
                return true;
@@ -700,33 +784,34 @@
         * Stores revision text in the text table. If the page ID is provided 
and
         * a revision exists with the same text, it will reuse it instead of
         * creating a duplicate entry in text table.
+        * If configured, stores text in external storage
         *
         * @param string $text Text of the revision to store
         * @param string $sha1 computed sha1 of the text
-        * @param int $pageID page id of the revision
+        * @param int $pageID page id of the revision, used to return the
+        *            previous revision text if it's the same (optional)
+        * @param int $revisionID revision id (optional)
         * @return int text id of the inserted text
         */
-       function storeText( $text, $sha1, $pageID = 0 ) {
+       function storeText( $text, $sha1, $pageID = 0, $revisionID = 0 ) {
+               global $wgDefaultExternalStore;
+
                if ( $pageID ) {
                        # Check first if the text already exists on any 
revision of the current page,
                        # to reuse text rows on page moves, protections, etc
+                       # Return the previous revision from that page
                        $row = $this->dbw->selectRow(
-                               array( 'revision', 'text' ),
-                               array_merge( array( 'rev_text_id' ), 
Revision::selectTextFields() ),
-                               array(
-                                       'rev_page' => $pageID,
-                                       'rev_sha1' => $sha1
-                               ),
+                               array( 'revision' ),
+                               array( 'rev_id', 'rev_sha1', 'rev_text_id' ),
+                               "rev_page = $pageID AND rev_id <= $revisionID",
                                __METHOD__,
-                               array(),
                                array(
-                                       'text' => array( 'INNER JOIN', array( 
'old_id=rev_text_id' ) )
+                                       'LIMIT' => 1,
+                                       'ORDER BY' => 'rev_id DESC'
                                )
                        );
 
-                       # We could have assumed that same sha1 means same text, 
but let's compare it
-                       # anyway just in case
-                       if ( $row && Revision::getRevisionText( $row ) === 
$text ) {
+                       if ( $row && $row->rev_sha1 == $sha1 ) {
                                # Return the existing text id instead of 
creating a new one
                                return $row->rev_text_id;
                        }
@@ -735,6 +820,19 @@
                $this->lastTextId++;
 
                $flags = Revision::compressRevisionText( $text );
+
+               # Write to external storage if required
+               if ( $wgDefaultExternalStore ) {
+                       # Store and get the URL
+                       $text = ExternalStore::insertToDefault( $text );
+                       if ( !$text ) {
+                               throw new MWException( "Unable to store text to 
external storage" );
+                       }
+                       if ( $flags ) {
+                               $flags .= ',';
+                       }
+                       $flags .= 'external';
+               }
 
                $e = array(
                        'id' => $this->lastTextId,
@@ -851,7 +949,7 @@
                $params = array(
                        'list' => 'deletedrevs',
                        'titles' => (string)$pageTitle,
-                       'drprop' => 
'revid|parentid|user|userid|comment|minor|len|content',
+                       'drprop' => 
'revid|parentid|user|userid|comment|minor|len|content|tags',
                        'drlimit' => 'max',
                        'drdir' => 'newer'
                );
@@ -910,13 +1008,6 @@
         */
        function processDeletedRevision( $revision, $ns, $title ) {
                global $wgLang;
-               $text = $revision['*'];
-               $comment = $revision['comment'];
-               if ( $comment ) {
-                       $comment = $wgLang->truncate( $comment, 255 );
-               } else {
-                       $comment = '';
-               }
 
                # Check if archived revision is already there to prevent 
duplicate entries
                if ( $revision['revid'] ) {
@@ -931,6 +1022,39 @@
                        }
                }
 
+               # Sloppy handler for revdeletions; just fills them in with 
dummy text
+               # and sets bitfield thingy
+               $revdeleted = 0;
+               if ( isset( $revision['userhidden'] ) ) {
+                       $revdeleted = $revdeleted | Revision::DELETED_USER;
+                       if ( !isset( $revision['user'] )) {
+                               $revision['user'] = ''; # username removed
+                       }
+                       if ( !isset( $revision['userid'] )) {
+                               $revision['userid'] = 0;
+                       }
+               }
+               if ( isset( $revision['commenthidden'] ) ) {
+                       $revdeleted = $revdeleted | Revision::DELETED_COMMENT;
+                       $comment = ''; # edit summary removed
+               } else {
+                       $comment = $revision['comment'];
+                       if ( $comment ) {
+                               $comment = $wgLang->truncate( $comment, 255 );
+                       } else {
+                               $comment = '';
+                       }
+               }
+               if ( isset( $revision['texthidden'] ) ) {
+                       $revdeleted = $revdeleted | Revision::DELETED_TEXT;
+                       $text = ''; # This content has been removed.
+               } else {
+                       $text = $revision['*'];
+               }
+               if ( isset ( $revision['suppressed'] ) ) {
+                       $revdeleted = $revdeleted | 
Revision::DELETED_RESTRICTED;
+               }
+
                $e = array(
                        'ns' => $ns,
                        'title' => $title,
@@ -940,10 +1064,12 @@
                        'user_text' => $revision['user'],
                        'timestamp' => wfTimestamp( TS_MW, 
$revision['timestamp'] ),
                        'minor_edit' => ( isset( $revision['minor'] ) ? 1 : 0 ),
-                       'deleted' => 0, #revdeleted; would need a handler 
elsewhere for these
+                       'deleted' => $revdeleted,
                        'len' => strlen( $text ),
                        'parent_id' => $revision['parentid'],
-                       'sha1' => Revision::base36Sha1( $text )
+                       'sha1' => Revision::base36Sha1( $text ),
+                       'content_model' => null, # Content handler not 
available for deleted revisions
+                       'content_format' => null
                );
 
                $e['text_id'] = $this->storeText( $text, $e['sha1'] );
@@ -963,14 +1089,39 @@
                        #'ar_page_id' => NULL, # Not requred and unreliable 
from api
                        'ar_parent_id' => $e['parent_id'],
                        'ar_sha1' => $e['sha1'],
+                       'ar_content_model' => $e['content_model'],
+                       'ar_content_format' => $e['content_format']
                );
 
-               $this->output( "Inserting deleted revision {$e['id']}\n" );
+               $this->output( sprintf( "Inserting deleted revision %s\n", 
$e['id'] ) );
                $this->dbw->insert(
                        'archive',
                        $insert_fields,
                        __METHOD__
                );
+
+               # Insert tags, if any
+               if ( isset( $revision['tags'] ) && count( $revision['tags'] ) > 
0 ) {
+                       foreach ( $revision['tags'] as $tag ) {
+                               $this->dbw->insert(
+                                       'change_tag',
+                                       array(
+                                               'ct_rev_id' => $e['id'],
+                                               'ct_tag' => $tag,
+                                       ),
+                                       __METHOD__
+                               );
+                       }
+                       $this->dbw->insert(
+                               'tag_summary',
+                               array(
+                                       'ts_rev_id' => $e['id'],
+                                       'ts_tags' => implode( ',', 
$revision['tags'] ),
+                               ),
+                               __METHOD__
+                       );
+               }
+
                $this->dbw->commit();
        }
 
diff --git a/grabText.php b/grabText.php
index 6166934..5dca9eb 100755
--- a/grabText.php
+++ b/grabText.php
@@ -7,17 +7,57 @@
  * @ingroup Maintenance
  * @author Jack Phoenix <[email protected]>
  * @author Calimonious the Estrange
- * @version 0.6
+ * @author Jesús Martínez <[email protected]>
+ * @version 0.7
  * @date 1 January 2013
  */
 
-# Because we're not in maintenance
+/**
+ * Set the correct include path for PHP so that we can run this script from
+ * $IP/grabbers/ and we don't need to move this file to $IP/maintenance/.
+ */
 ini_set( 'include_path', __DIR__ . '/../maintenance' );
 
 require_once 'Maintenance.php';
 require_once 'mediawikibot.class.php';
 
 class GrabText extends Maintenance {
+
+       /**
+        * Whether our wiki supports page counters, to use counters if remote 
wiki also has them
+        *
+        * @var bool
+        */
+       protected $supportsCounters;
+
+       /**
+        * End date
+        *
+        * @var string
+        */
+       protected $endDate;
+
+       /**
+        * Last text id in the current db
+        *
+        * @var int
+        */
+       protected $lastTextId = 0;
+
+       /**
+        * Handle to the database connection
+        *
+        * @var DatabaseBase
+        */
+       protected $dbw;
+
+       /**
+        * MediaWikiBot instance
+        *
+        * @var MediaWikiBot
+        */
+       protected $bot;
+
        public function __construct() {
                parent::__construct();
                $this->mDescription = "Grab text from an external wiki and 
import it into one of ours.\nDon't use this on a large wiki unless you 
absolutely must; it will be incredibly slow.";
@@ -25,46 +65,62 @@
                $this->addOption( 'username', 'Username to log into the target 
wiki', false, true, 'n' );
                $this->addOption( 'password', 'Password on the target wiki', 
false, true, 'p' );
                $this->addOption( 'db', 'Database name, if we don\'t want to 
write to $wgDBname', false, true );
-               # $this->addOption( 'start', 'Revision number at which to 
start', false, true );
+               $this->addOption( 'start', 'Page at which to start, useful if 
the script stopped at this point', false, true );
                $this->addOption( 'enddate', 'End point (20121222142317, 
2012-12-22T14:23:17T, etc); defaults to current timestamp.', false, true );
-               $this->addOption( 'carlb', 'Tells the script to use lower api 
limits', false, false );
+               $this->addOption( 'namespaces', 'Pipe-separated namespaces (ID) 
to grab. Defaults to all namespaces', false, true );
        }
 
        public function execute() {
-               global $bot, $endDate, $wgDBname, $lastRevision, $skipped;
+               global $wgDBname;
+
                $url = $this->getOption( 'url' );
                if( !$url ) {
-                       $this->error( "The URL to the source wiki\'s api.php 
must be specified!\n", true );
+                       $this->error( "The URL to the source wiki\'s api.php 
must be specified!\n", 1 );
                }
-               $carlb = $this->getOption( 'carlb' );
 
                $user = $this->getOption( 'username' );
                $password = $this->getOption( 'password' );
-               $endDate = $this->getOption( 'enddate' );
-               if ( $endDate ) {
-                       if ( !wfTimestamp( TS_ISO_8601, $endDate ) ) {
-                               $this->error( "Invalid enddate format.\n", true 
);
+
+               $this->endDate = $this->getOption( 'enddate' );
+               if ( $this->endDate ) {
+                       if ( !wfTimestamp( TS_ISO_8601, $this->endDate ) ) {
+                               $this->error( "Invalid enddate format.\n", 1 );
                        }
                } else {
-                       $endDate = wfTimestampNow();
+                       $this->endDate = wfTimestampNow();
                }
+
+               # Get a single DB_MASTER connection
+               $this->dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 
'db', $wgDBname ) );
+
+               # Check if wiki supports page counters (removed from core in 
1.25)
+               $this->supportsCounters = $this->dbw->fieldExists( 'page', 
'page_counter', __METHOD__ );
+
+               # Get last text id
+               $this->lastTextId = (int)$this->dbw->selectField(
+                       'text',
+                       'old_id',
+                       array(),
+                       __METHOD__,
+                       array( 'ORDER BY' => 'old_id DESC' )
+               );
 
                # bot class and log in if requested
                if ( $user && $password ) {
-                       $bot = new MediaWikiBot(
+                       $this->bot = new MediaWikiBot(
                                $url,
                                'json',
                                $user,
                                $password,
                                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:13.0) 
Gecko/20100101 Firefox/13.0.1'
                        );
-                       if ( !$bot->login() ) {
+                       if ( !$this->bot->login() ) {
                                $this->output( "Logged in as $user...\n" );
                        } else {
-                               $this->output( "Warning - failed to log in as 
$user.\n" );
+                               $this->error( "Failed to log in as $user.\n", 1 
);
                        }
                } else {
-                       $bot = new MediaWikiBot(
+                       $this->bot = new MediaWikiBot(
                                $url,
                                'json',
                                '',
@@ -73,8 +129,6 @@
                        );
                }
 
-               $skipped = array();
-               $pageList = array();
                $this->output( "\n" );
 
                # Get all pages as a list, start by getting namespace numbers...
@@ -84,177 +138,248 @@
                        'meta' => 'siteinfo',
                        'siprop' => 'namespaces|statistics|namespacealiases'
                );
-               $result = $bot->query( $params );
+               $result = $this->bot->query( $params );
                $siteinfo = $result['query'];
 
                # No data - bail out early
                if ( empty( $siteinfo ) ) {
-                       $this->error( 'No siteinfo data found...', true );
+                       $this->error( 'No siteinfo data found', 1 );
                }
 
-               $textNamespaces = array();
-               foreach ( array_keys( $siteinfo['namespaces'] ) as $ns ) {
-                       # Ignore special and weird Wikia namespaces
-                       if ( $ns  < 0 || $ns >= 400 ) {
-                               continue;
+               if ( $this->hasOption( 'namespaces' ) ) {
+                       $textNamespaces = explode( '|', $this->getOption( 
'namespaces', '' ) );
+                       $grabFromAllNamespaces = false;
+               } else {
+                       $grabFromAllNamespaces = true;
+                       foreach ( array_keys( $siteinfo['namespaces'] ) as $ns 
) {
+                               # Ignore special
+                               if ( $ns >= 0 ) {
+                                       $textNamespaces[] = $ns;
+                               }
                        }
-                       $textNamespaces[] = $ns;
                }
                if ( !$textNamespaces ) {
-                       $this->error( 'Got no namespaces...', true );
+                       $this->error( 'Got no namespaces', 1 );
                }
 
-               # Get list of live pages from namespaces and continue from there
-               $pageCount = $siteinfo['statistics']['pages'];
+               if ( $grabFromAllNamespaces ) {
+                       # Get list of live pages from namespaces and continue 
from there
+                       $pageCount = $siteinfo['statistics']['pages'];
+                       $this->output( "Generating page list from all 
namespaces - $pageCount expected...\n" );
+               } else {
+                       $this->output( sprintf( "Generating page list from %s 
namespaces...\n", count( $textNamespaces ) ) );
+               }
 
-               $this->output( "Generating page list - $pageCount 
expected...\n" );
+               $start = $this->getOption( 'start' );
+               if ( $start ) {
+                       $title = Title::newFromText( $start );
+                       if ( is_null( $title ) ) {
+                               $this->error( 'Invalid title provided for the 
start parameter', 1 );
+                       }
+                       $this->output( sprintf( "Trying to resume import from 
page %s\n", $title ) );
+               }
+
                $pageCount = 0;
-               $doneCount = 0;
 
                foreach ( $textNamespaces as $ns ) {
-                       $nsPageCount = 0;
-                       $more = true;
-                       $gapfrom = null;
-                       $params = array(
-                               'generator' => 'allpages',
-                               'gaplimit' => 'max',
-                               'prop' => 'info',
-                               'inprop' => 'protection',
-                               'gapnamespace' => $ns
-                       );
-                       do {
-                               # Note - 'gapfrom' became 'gapcontinue' in 
mw1.20, though the former is still supported.
-                               if ( $gapfrom === null ) {
-                                       unset( $params['gapfrom'] );
+                       $continueTitle = null;
+                       if ( isset( $title ) && ! is_null( $title ) ) {
+                               if ( $title->getNamespace() === $ns ) {
+                                       $continueTitle = 
$title->getPrefixedText();
+                                       $title = null;
                                } else {
-                                       $params['gapfrom'] = $gapfrom;
+                                       continue;
                                }
-                               $result = $bot->query( $params );
+                       }
+                       $pageCount += $this->processPagesFromNamespace( $ns, 
$continueTitle );
+               }
+               $this->output( "\nDone - found $pageCount total pages.\n" );
+               # Done.
+       }
 
-                               # Skip empty namespaces
-                               if ( isset( $result['query'] ) ) {
-                                       $pages = $result['query']['pages'];
+       /**
+        * Grabs all pages from a given namespace
+        *
+        * @param int $ns Namespace to process.
+        * @param string $continueTitle Title to start from (optional).
+        * @return int Number of pages processed.
+        */
+       function processPagesFromNamespace( $ns, $continueTitle = null ) {
+               $this->output( "Processing pages from namespace $ns...\n" );
+               $doneCount = 0;
+               $nsPageCount = 0;
+               $more = true;
+               $params = array(
+                       'generator' => 'allpages',
+                       'gaplimit' => 'max',
+                       'prop' => 'info',
+                       'inprop' => 'protection',
+                       'gapnamespace' => $ns
+               );
+               if ( $continueTitle ) {
+                       $params['gapfrom'] = $continueTitle;
+               }
+               do {
+                       $result = $this->bot->query( $params );
 
-                                       $resultsCount = 0;
-                                       foreach ( $pages as $page ) {
-                                               $this->processPage( $page );
-                                               $doneCount++;
-                                               if ( $doneCount % 500 === 0 ) {
-                                                       $this->output( 
"$doneCount\n" );
-                                               }
-                                               $resultsCount++;
+                       # Skip empty namespaces
+                       if ( isset( $result['query'] ) ) {
+                               $pages = $result['query']['pages'];
+
+                               $resultsCount = 0;
+                               foreach ( $pages as $page ) {
+                                       $this->processPage( $page );
+                                       $doneCount++;
+                                       if ( $doneCount % 500 === 0 ) {
+                                               $this->output( "$doneCount\n" );
                                        }
-                                       $nsPageCount += $resultsCount;
+                                       $resultsCount++;
+                               }
+                               $nsPageCount += $resultsCount;
 
-                                       # Try mw1.20+ version and fall back to 
old gapfrom if it fails.
-                                       if ( isset( $result['query-continue'] ) 
) {
-                                               if ( isset( 
$result['query-continue']['allpages']['gapcontinue'] ) ) {
-                                                       $gapfrom = 
$result['query-continue']['allpages']['gapcontinue'];
-                                               } else {
-                                                       $gapfrom = 
$result['query-continue']['allpages']['gapfrom'];
-                                               }
-                                       } else {
-                                               $gapfrom = null;
-                                       }
-                                       $more = !( $gapfrom === null );
+                               if ( isset( $result['query-continue'] ) && 
isset( $result['query-continue']['allpages'] ) ) {
+                                       # Add continuation parameters
+                                       $params = array_merge( $params, 
$result['query-continue']['allpages'] );
                                } else {
                                        $more = false;
                                }
-                       } while ( $more );
+                       } else {
+                               $more = false;
+                       }
+               } while ( $more );
 
-                       $this->output( "$nsPageCount pages found in namespace 
$ns.\n" );
-                       $pageCount += $nsPageCount;
-               }
-               $this->output( "\nDone - found $pageCount total pages.\n" );
+               $this->output( "$nsPageCount pages found in namespace $ns.\n" );
 
-               # Print skipped list
-               $this->output( "\nPage IDs skipped (not found):" );
-               foreach ( $skipped as $pageID ) {
-                       $this->output( "$pageID\n" );
-               }
-
-               $this->output( "\n" );
-               # Done.
+               return $nsPageCount;
        }
 
        /**
         * Handle an individual page.
         *
-        * @param array $page Array retrieved from the API, containing pageid,
-        *                     page title, namespace, protection status and 
more...
-        * @param int $start Timestamp from which to get revisions; if this is
-        *                     defined, protection stuff is skipped.
+        * @param array $page Array retrieved from the API, containing pageid,
+        *     page title, namespace, protection status and more...
         */
-       function processPage( $page, $start = null ) {
-               global $wgDBname, $bot, $endDate, $carlb, $skipped;
+       function processPage( $page ) {
+               global $wgContentHandlerUseDB;
 
                $pageID = $page['pageid'];
-               $title = $page['title'];
-               $ns = $page['ns'];
-               $localID = $pageID;
-               $titleIsPresent = false;
 
-               $this->output( "Processing page $pageID: $title\n" );
+               $this->output( "Processing page id $pageID...\n" );
 
-               # Trim and convert displayed title to database page title
-               if ( $ns != 0 ) {
-                       $title = preg_replace( '/^[^:]*?:/', '', $title );
+               $params = array(
+                       'prop' => 'info|revisions',
+                       'rvlimit' => 'max',
+                       'rvprop' => 
'ids|flags|timestamp|user|userid|comment|content|tags',
+                       'rvdir' => 'newer',
+                       'rvend' => wfTimestamp( TS_ISO_8601, $this->endDate )
+               );
+               $params['pageids'] = $pageID;
+               if ( $page['protection'] ) {
+                       $params['inprop'] = 'protection';
                }
-               $title = str_replace( ' ', '_', $title );
+               if ( $wgContentHandlerUseDB ) {
+                       $params['rvprop'] = $params['rvprop'] . '|contentmodel';
+               }
 
-               $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', 
$wgDBname ) );
-               if ( $start ) {
-                       # Check if title is present
-                       $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 
'db', $wgDBname ) );
-                       $result = $dbr->select(
-                               'page',
-                               'page_id',
-                               array(
-                                       'page_namespace' => $ns,
-                                       'page_title' => $title
-                               ),
-                               __METHOD__
-                       );
-                       $row = $dbr->fetchObject( $result );
-                       if ( $row ) {
-                               $localID = $row->page_id;
-                               $titleIsPresent = true;
+               $result = $this->bot->query( $params );
 
-                       } else {
-                               # Check if id is present
-                               $result = $dbr->select(
-                                       'page',
-                                       'page_title',
-                                       array( 'page_id' => $pageID ),
-                                       __METHOD__
-                               );
-                               if ( $dbr->fetchObject( $result ) ) {
-                                       $resid = (int)$dbr->selectField(
-                                               'page',
-                                               'page_id',
-                                               array(),
-                                               __METHOD__,
-                                               array( 'ORDER BY' => 'page_id 
desc' )
-                                       );
-                                       $localID = $resid + 1;
-                               }
+               if ( ! $result || isset( $result['error'] ) ) {
+                       $this->error( "Error getting revision information from 
API for page id $pageID.", 1 );
+                       return;
+               }
+
+               if ( isset( $params['inprop'] ) ) {
+                       unset( $params['inprop'] );
+               }
+
+               $info_pages = array_values( $result['query']['pages'] );
+               if ( isset( $info_pages[0]['missing'] ) ) {
+                       $this->output( "Page id $pageID not found.\n" );
+                       return;
+               }
+
+               if ( !$pageID ) {
+                       $pageID = $info_pages[0]['pageid'];
+               }
+
+               $page_e = array(
+                       'namespace' => null,
+                       'title' => null,
+                       'restrictions' => '',
+                       'counter' => 0,
+                       'is_redirect' => 0,
+                       'is_new' => 0,
+                       'random' => wfRandom(),
+                       'touched' => wfTimestampNow(),
+                       'len' => 0,
+                       'content_model' => null
+               );
+               # Trim and convert displayed title to database page title
+               # Get it from the returned value from api
+               $page_e['namespace'] = $info_pages[0]['ns'];
+               $page_e['title'] = $this->sanitiseTitle( $info_pages[0]['ns'], 
$info_pages[0]['title'] );
+
+               # Get other information from api info
+               $page_e['is_redirect'] = ( isset( $info_pages[0]['redirect'] ) 
? 1 : 0 );
+               $page_e['is_new'] = ( isset( $info_pages[0]['new'] ) ? 1 : 0 );
+               $page_e['len'] = $info_pages[0]['length'];
+               $page_e['counter'] = ( isset( $info_pages[0]['counter'] ) ? 
$info_pages[0]['counter'] : 0 );
+               $page_e['latest'] = $info_pages[0]['lastrevid'];
+               $defaultModel = null;
+               if ( $wgContentHandlerUseDB && isset( 
$info_pages[0]['contentmodel'] ) ) {
+                       # This would be the most accurate way of getting the 
content model for a page.
+                       # However it calls hooks and can be incredibly slow or 
cause errors
+                       #$defaultModel = ContentHandler::getDefaultModelFor( 
Title::makeTitle( $page_e['namespace'], $page_e['title'] ) );
+                       $defaultModel = MWNamespace::getNamespaceContentModel( 
$info_pages[0]['ns'] ) || CONTENT_MODEL_WIKITEXT;
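+                       # Set only if not the default content model. NOTE(review): "||" above yields a boolean in PHP, so $defaultModel is always true here; "?:" looks intended - confirm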
+                       if ( $defaultModel != $info_pages[0]['contentmodel'] ) {
+                               $page_e['content_model'] = 
$info_pages[0]['contentmodel'];
                        }
                }
 
-               # Update page_restrictions
-               # NOTE - this doesn't support if the protections are already 
there, just adds blindly
-               if ( !$start && $page['protection'] ) {
-                       foreach ( $page['protection'] as $prot ) {
+               # Check if page is present
+               $pageIsPresent = false;
+               $rowCount = $this->dbw->selectRowCount(
+                       'page',
+                       'page_id',
+                       array( 'page_id' => $pageID ),
+                       __METHOD__
+               );
+               if ( $rowCount ) {
+                       $pageIsPresent = true;
+               }
+
+               # If page is not present, check if title is present, because we 
can't insert
+               # a duplicate title. That would mean the page was moved leaving 
a redirect but
+               # we haven't processed the move yet
+               if ( ! $pageIsPresent ) {
+                       $conflictingPageID = $this->getPageID( 
$page_e['namespace'], $page_e['title'] );
+                       if ( $conflictingPageID ) {
+                               # Whoops...
+                               $this->resolveConflictingTitle( 
$conflictingPageID, $page_e['namespace'], $page_e['title'] );
+                       }
+               }
+
+               # Update page_restrictions (only if requested)
+               if ( isset( $info_pages[0]['protection'] ) ) {
+                       $this->output( "Setting page_restrictions changes on 
page_id $pageID.\n" );
+                       # First delete any existing protection
+                       $this->dbw->delete(
+                               'page_restrictions',
+                               array( 'pr_page' => $pageID ),
+                               __METHOD__
+                       );
+                       # insert current restrictions
+                       foreach ( $info_pages[0]['protection'] as $prot ) {
                                $e = array(
-                                       'page' => $localID,
+                                       'page' => $pageID,
                                        'type' => $prot['type'],
                                        'level' => $prot['level'],
                                        'cascade' => 0,
                                        'user' => null,
-                                       'expiry' => ( $prot['expiry'] == 
'infinity' ? 'infinity' : wfTimestamp( TS_MW, $prot['expiry'] ) ),
-                                       'id' => null
+                                       'expiry' => ( $prot['expiry'] == 
'infinity' ? 'infinity' : wfTimestamp( TS_MW, $prot['expiry'] ) )
                                );
-                               $dbw->insert(
+                               $this->dbw->insert(
                                        'page_restrictions',
                                        array(
                                                'pr_page' => $e['page'],
@@ -262,190 +387,100 @@
                                                'pr_level' => $e['level'],
                                                'pr_cascade' => $e['cascade'],
                                                'pr_user' => $e['user'],
-                                               'pr_expiry' => $e['expiry'],
-                                               'pr_id' => $e['id'],
+                                               'pr_expiry' => $e['expiry']
                                        ),
                                        __METHOD__
                                );
-                               $dbw->commit();
-                               # $this->output( "Committed page_restrictions 
changes.\n" );
                        }
                }
 
-               $page_e = array(
-                       'id' => $localID,
-                       'namespace' => $ns,
-                       'title' => $title,
-                       'restrictions' => '',
-                       'counter' => 0,
-                       'is_redirect' => ( isset( $page['redirect'] ) ? 1 : 0 ),
-                       'is_new' => 0,
-                       'random' => wfRandom(),
-                       'touched' => wfTimestampNow(),
-                       'len' => $page['length'],
-               );
+               $revisionsProcessed = false;
+               while ( true ) {
+                       foreach ( $info_pages[0]['revisions'] as $revision ) {
+                               $revisionsProcessed = $this->processRevision( 
$revision, $pageID, $defaultModel ) || $revisionsProcessed;
+                       }
 
-               # Retrieving the list of revisions, including text.
-               $revision_latest;
-               $last_rev_id = 0;
-               $more = true;
-               $rvcontinue = null;
-               # 'rvcontinue' in 1.20+, 'rvstartid' in 1.19-
-               $rvcontinuename = 'rvcontinue';
+                       if ( isset( $result['query-continue'] ) && isset( 
$result['query-continue']['revisions'] ) ) {
+                               # Add continuation parameters
+                               $params = array_merge( $params, 
$result['query-continue']['revisions'] );
+                       } else {
+                               break;
+                       }
 
-               if ( $carlb ) {
-                       $rvmax = 10;
-               } else {
-                       $rvmax = 'max';
+                       $result = $this->bot->query( $params );
+                       if ( ! $result || isset( $result['error'] ) ) {
+                               $this->error( "Error getting revision 
information from API for page id $pageID.", 1 );
+                               return;
+                       }
+
+                       $info_pages = array_values( $result['query']['pages'] );
                }
 
-               $params = array(
-                       'prop' => 'revisions',
-                       'pageids' => $pageID,
-                       'rvlimit' => $rvmax,
-                       'rvprop' => 
'ids|flags|timestamp|user|userid|comment|content|tags',
-                       'rvdir' => 'newer',
-                       'rvend' => wfTimestamp( TS_ISO_8601, $endDate )
-               );
-               if ( $start ) {
-                       $params['rvstart'] = wfTimestamp( TS_ISO_8601, $start );
-               }
-               do {
-                       if ( $rvcontinue === null ) {
-                               unset( $params[$rvcontinuename] );
-                       } else {
-                               $params[$rvcontinuename] = $rvcontinue;
-                       }
-
-                       $result = $bot->query( $params );
-                       if ( isset( $result['query']['pages'] ) ) {
-                               $last_rev_info = $this->processPageResult( 
$result, $localID, $last_rev_id );
-                       } else {
-                               if ( $params['rvlimit'] == 1 ) {
-                                       $this->output( "Page id $pageID not 
found.\n" );
-                                       return;
-                               } else {
-                                       $params['rvlimit'] = 1;
-                                       $result = $bot->query( $params );
-                                       if ( isset( $result['query']['pages'] ) 
) {
-                                               $last_rev_info = 
$this->processPageResult( $result, $localID, $last_rev_id );
-                                       } else {
-                                               $this->output( "Page id $pageID 
not found.\n" );
-                                               $skipped[] = $pageID;
-                                               return;
-                                       }
-                               }
-                       }
-                       if ( isset( $result['query-continue'] ) ) {
-                               # Check name being used - if it's not the set 
one, reset it
-                               if ( !isset( 
$result['query-continue']['revisions'][$rvcontinuename] ) ) {
-                                       $rvcontinuename = 'rvstartid';
-                               }
-                               $rvcontinue = 
$result['query-continue']['revisions'][$rvcontinuename];
-                       } else {
-                               $rvcontinue = null;
-                       }
-                       $more = !( $rvcontinue === null );
-               } while ( $more );
-
-               if ( !$last_rev_info ) {
-                       # Dupe.
+               if ( !$revisionsProcessed ) {
+                       # We already processed the page before? page doesn't 
need updating, then
                        return;
                }
 
-               $page_e['latest'] = $last_rev_info[0];
-               $page_e['len'] = $last_rev_info[1];
-               if ( !$start ) {
-                       $dbw->insert(
+               $insert_fields = array(
+                       'page_namespace' => $page_e['namespace'],
+                       'page_title' => $page_e['title'],
+                       'page_restrictions' => $page_e['restrictions'],
+                       'page_is_redirect' => $page_e['is_redirect'],
+                       'page_is_new' => $page_e['is_new'],
+                       'page_random' => $page_e['random'],
+                       'page_touched' => $page_e['touched'],
+                       'page_latest' => $page_e['latest'],
+                       'page_len' => $page_e['len'],
+                       'page_content_model' => $page_e['content_model']
+               );
+               if ( $this->supportsCounters && $page_e['counter'] ) {
+                       $insert_fields['page_counter'] = $page_e['counter'];
+               }
+               if ( ! $pageIsPresent ) {
+                       # insert if not present
+                       $this->output( "Inserting page entry $pageID\n" );
+                       $insert_fields['page_id'] = $pageID;
+                       $this->dbw->insert(
                                'page',
-                               array(
-                                       'page_id' => $page_e['id'],
-                                       'page_namespace' => 
$page_e['namespace'],
-                                       'page_title' => $page_e['title'],
-                                       'page_restrictions' => 
$page_e['restrictions'],
-                                       'page_counter' => $page_e['counter'],
-                                       'page_is_redirect' => 
$page_e['is_redirect'],
-                                       'page_is_new' => $page_e['is_new'],
-                                       'page_random' => $page_e['random'],
-                                       'page_touched' => $page_e['touched'],
-                                       'page_latest' => $page_e['latest'],
-                                       'page_len' => $page_e['len']
-                               ),
+                               $insert_fields,
                                __METHOD__
                        );
                } else {
-                       # update or insert if not present
-                       if ( $titleIsPresent ) {
-                               $this->output( "Updating page entry $localID\n" 
);
-                               $dbw->update(
-                                       'page',
-                                       array(
-                                               'page_namespace' => 
$page_e['namespace'],
-                                               'page_title' => 
$page_e['title'],
-                                               'page_restrictions' => 
$page_e['restrictions'],
-                                               'page_counter' => 
$page_e['counter'],
-                                               'page_is_redirect' => 
$page_e['is_redirect'],
-                                               'page_is_new' => 
$page_e['is_new'],
-                                               'page_random' => 
$page_e['random'],
-                                               'page_touched' => 
$page_e['touched'],
-                                               'page_latest' => 
$page_e['latest'],
-                                               'page_len' => $page_e['len']
-                                       ),
-                                       array( 'page_id' => $localID ),
-                                       __METHOD__
-                               );
-                       } else {
-
-                               $this->output( "Inserting page entry 
$localID\n" );
-                               $dbw->insert(
-                                       'page',
-                                       array(
-                                               'page_id' => $localID,
-                                               'page_namespace' => 
$page_e['namespace'],
-                                               'page_title' => 
$page_e['title'],
-                                               'page_restrictions' => 
$page_e['restrictions'],
-                                               'page_counter' => 
$page_e['counter'],
-                                               'page_is_redirect' => 
$page_e['is_redirect'],
-                                               'page_is_new' => 
$page_e['is_new'],
-                                               'page_random' => 
$page_e['random'],
-                                               'page_touched' => 
$page_e['touched'],
-                                               'page_latest' => 
$page_e['latest'],
-                                               'page_len' => $page_e['len']
-                                       ),
-                                       __METHOD__
-                               );
-                       }
+                       # update existing
+                       $this->output( "Updating page entry $pageID\n" );
+                       $this->dbw->update(
+                               'page',
+                               $insert_fields,
+                               array( 'page_id' => $pageID ),
+                               __METHOD__
+                       );
                }
-               $dbw->commit();
-       }
-
-       /**
-        * Take the result from revision request and call processRevision
-        */
-       function processPageResult( $result, $localID, $last_rev_id ) {
-               $revisions = array_values( $result['query']['pages'] );
-               $revisions = $revisions[0]['revisions'];
-
-               foreach ( $revisions as $revision ) {
-                       $last_rev_info = $this->processRevision( $revision, 
$localID, $last_rev_id );
-               }
-               return $last_rev_info;
+               $this->dbw->commit();
        }
 
        /**
         * Process an individual page revision.
         *
         * @param array $revision Array retrieved from the API, containing the 
revision
-        *                    text, ID, timestamp, whether it was a minor edit 
or
-        *                    not and much more
-        * @param int $page_id Page ID
-        * @param int $prev_rev_id Previous revision ID (revision.rev_parent_id)
+        *     text, ID, timestamp, whether it was a minor edit or not and much 
more
+        * @param int $page_id Page ID number of the revision we are going to 
insert
+        * @param string $defaultModel Default content model for this page
+        * @return bool Whether revision has been inserted or not
         */
-       function processRevision( $revision, $page_id, $prev_rev_id ) {
-               global $wgLang, $wgDBname, $lastRevision;
+       function processRevision( $revision, $page_id, $defaultModel ) {
+               global $wgLang, $wgContentHandlerUseDB;
+               $revid = $revision['revid'];
 
-               if ( $revision['revid'] <= $lastRevision ) {
-                       # Oops? Too recent.
+               # Workaround: check whether the revision is already there.
+               $rowCount = $this->dbw->selectRowCount(
+                       'revision',
+                       'rev_id',
+                       array( 'rev_id' => $revid ),
+                       __METHOD__
+               );
+               if ( $rowCount ) {
+                       # Already in database
+                       $this->output( "Revision $revid is already in the 
database. Skipped.\n" );
                        return false;
                }
 
@@ -453,145 +488,179 @@
                # and sets bitfield thingy
                $revdeleted = 0;
                if ( isset( $revision['userhidden'] ) ) {
-                       $revdeleted = $revdeleted | 4;
-                       $revision['user'] = 'username removed';
-                       $revision['userid'] = 0;
-               }
-               if ( isset( $revision['commenthidden'] ) ) {
-                       $revdeleted = $revdeleted | 2;
-                       $revision['comment'] = 'edit summary removed';
-               }
-               if ( isset( $revision['texthidden'] ) ) {
-                       $revdeleted = $revdeleted | 1;
-                       $revision['*'] = 'This content has been removed.';
-               }
-
-               # Workaround check if it's already there; disabled for now
-               if ( false ) {
-                       $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 
'db', $wgDBname ) );
-                       $result = $dbr->select(
-                               'revision',
-                               'rev_page',
-                               array( 'rev_id' => $revision['revid'] ),
-                               __METHOD__
-                       );
-                       if ( $dbr->fetchObject( $result ) ) {
-                               # Already in database
-                               return false;
+                       $revdeleted = $revdeleted | Revision::DELETED_USER;
+                       if ( !isset( $revision['user'] )) {
+                               $revision['user'] = ''; # username removed
+                       }
+                       if ( !isset( $revision['userid'] )) {
+                               $revision['userid'] = 0;
                        }
                }
-
-               $text = $revision['*'];
-               $comment = $revision['comment'];
-               if ( $comment ) {
-                       $comment = $wgLang->truncate( $comment, 255 );
+               if ( isset( $revision['commenthidden'] ) ) {
+                       $revdeleted = $revdeleted | Revision::DELETED_COMMENT;
+                       $comment = ''; # edit summary removed
                } else {
-                       $comment = '';
+                       $comment = $revision['comment'];
+                       if ( $comment ) {
+                               $comment = $wgLang->truncate( $comment, 255 );
+                       } else {
+                               $comment = '';
+                       }
                }
-               $tags = $revision['tags'];
+               if ( isset( $revision['texthidden'] ) ) {
+                       $revdeleted = $revdeleted | Revision::DELETED_TEXT;
+                       $text = ''; # This content has been removed.
+               } else {
+                       $text = $revision['*'];
+               }
+               if ( isset ( $revision['suppressed'] ) ) {
+                       $revdeleted = $revdeleted | 
Revision::DELETED_RESTRICTED;
+               }
 
                $e = array(
-                       'id' => $revision['revid'],
-                       'parent_id' => $revision['parentid'],
+                       'id' => $revid,
                        'page' => $page_id,
-                       'text_id' => $this->storeText( $text ),
                        'comment' => $comment,
                        'user' => $revision['userid'], # May not be accurate to 
the new wiki, obvious, but whatever.
                        'user_text' => $revision['user'],
                        'timestamp' => wfTimestamp( TS_MW, 
$revision['timestamp'] ),
-                       'minor_edit' => ( isset( $reisionv['minor'] ) ? 1 : 0 ),
+                       'minor_edit' => ( isset( $revision['minor'] ) ? 1 : 0 ),
                        'deleted' => $revdeleted,
                        'len' => strlen( $text ),
-                       'parent_id' => ( $prev_rev_id || 0 )
+                       'parent_id' => $revision['parentid'],
+                       # Do not attempt to get the field from api, because 
it's not what
+                       # you'd expect. See T75411
+                       'sha1' => Revision::base36Sha1( $text ),
+                       'content_model' => null,
+                       'content_format' => null
                );
 
-               $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', 
$wgDBname ) );
-               # insert revisions
-               $dbw->insert(
+               $e['text_id'] = $this->storeText( $text, $e['sha1'], $page_id, 
$revid );
+
+               # Set content model
+               if ( $wgContentHandlerUseDB && isset( $revision['contentmodel'] 
) ) {
+                       # Set only if not the default content model
+                       if ( $defaultModel != $revision['contentmodel'] ) {
+                               $e['content_model'] = $revision['contentmodel'];
+                               $defaultFormat = ContentHandler::getForModelID( 
$defaultModel )->getDefaultFormat();
+                               if ( $defaultFormat != 
$revision['contentformat'] ) {
+                                       $e['content_format'] = 
$revision['contentformat'];
+                               }
+                       }
+               }
+
+               $insert_fields = array(
+                       'rev_id' => $e['id'],
+                       'rev_page' => $e['page'],
+                       'rev_text_id' => $e['text_id'],
+                       'rev_comment' => $e['comment'],
+                       'rev_user' => $e['user'],
+                       'rev_user_text' => $e['user_text'],
+                       'rev_timestamp' => $e['timestamp'],
+                       'rev_minor_edit' => $e['minor_edit'],
+                       'rev_deleted' => $e['deleted'],
+                       'rev_len' => $e['len'],
+                       'rev_parent_id' => $e['parent_id'],
+                       'rev_sha1' => $e['sha1'],
+                       'rev_content_model' => $e['content_model'],
+                       'rev_content_format' => $e['content_format'],
+               );
+
+               $this->output( sprintf( "Inserting revision %s\n", $e['id'] ) );
+               $this->dbw->insert(
                        'revision',
-                       array(
-                               'rev_id' => $e['id'],
-                               'rev_parent_id' => $e['parentid'],
-                               'rev_page' => $e['page'],
-                               'rev_text_id' => $e['text_id'],
-                               'rev_comment' => $e['comment'],
-                               'rev_user' => $e['user'],
-                               'rev_user_text' => $e['user_text'],
-                               'rev_timestamp' => $e['timestamp'],
-                               'rev_minor_edit' => $e['minor_edit'],
-                               'rev_deleted' => $e['deleted'],
-                               'rev_len' => $e['len'],
-                               'rev_parent_id' => $e['parent_id'],
-                       ),
+                       $insert_fields,
                        __METHOD__
                );
 
                # Insert tags, if any
-               if ( count( $tags ) ) {
-                       $tagBlob = '';
-                       foreach ( $tags as $tag ) {
-                               $dbw->insert(
-                                       'change_tags',
+               if ( isset( $revision['tags'] ) && count( $revision['tags'] ) > 
0 ) {
+                       foreach ( $revision['tags'] as $tag ) {
+                               $this->dbw->insert(
+                                       'change_tag',
                                        array(
                                                'ct_rev_id' => $e['id'],
                                                'ct_tag' => $tag,
                                        ),
                                        __METHOD__
                                );
-                               if ( $tagBlob == '' ) {
-                                       $tagBlob = $tag;
-                               } else {
-                                       $tagBlob = "$tagBlob, $tag";
-                               }
                        }
-                       $dbw->insert(
+                       $this->dbw->insert(
                                'tag_summary',
                                array(
                                        'ts_rev_id' => $e['id'],
-                                       'ts_tags' => $tagBlob,
+                                       'ts_tags' => implode( ',', 
$revision['tags'] ),
                                ),
                                __METHOD__
                        );
                }
-               $dbw->commit();
 
-               return array( $revision['revid'], $e['len'] );
+               $this->dbw->commit();
+
+               return true;
        }
 
-       # Stores revision texts in the text table.
-       function storeText( $text ) {
-               global $current_text_id, $wgDBname;
+       /**
+        * Stores revision text in the text table. If the page ID is provided 
and
+        * a revision exists with the same text, it will reuse it instead of
+        * creating a duplicate entry in text table.
+        * If configured, stores text in external storage
+        *
+        * @param string $text Text of the revision to store
+        * @param string $sha1 computed sha1 of the text
+        * @param int $pageID page id of the revision, used to return the
+        *            previous revision text if it's the same (optional)
+        * @param int $revisionID revision id (optional)
+        * @return int text id of the inserted text
+        */
+       function storeText( $text, $sha1, $pageID = 0, $revisionID = 0 ) {
+               global $wgDefaultExternalStore;
 
-               if ( !isset( $current_text_id ) ) {
-                       $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 
'db', $wgDBname ) );
-                       $result = $dbr->select(
-                               'text',
-                               'old_id',
-                               '',
+               if ( $pageID ) {
+                       # Check first if the text is the same as in the latest 
local revision of the current page up to this revision ID,
+                       # to reuse text rows on page moves, protections, etc
+                       # Only that single adjacent revision is queried
+                       $row = $this->dbw->selectRow(
+                               array( 'revision' ),
+                               array( 'rev_id', 'rev_sha1', 'rev_text_id' ),
+                               "rev_page = $pageID AND rev_id <= $revisionID",
                                __METHOD__,
                                array(
                                        'LIMIT' => 1,
-                                       'ORDER BY' => '`text`.`old_id` DESC'
+                                       'ORDER BY' => 'rev_id DESC'
                                )
                        );
-                       $row = $dbr->fetchObject( $result );
-                       if ( $row ) {
-                               $current_text_id = $row->old_id;
-                       } else {
-                               $current_text_id = 0;
+
+                       if ( $row && $row->rev_sha1 == $sha1 ) {
+                               # Return the existing text id instead of 
creating a new one
+                               return $row->rev_text_id;
                        }
-                       $dbr->freeResult( $result );
                }
-               $current_text_id++;
+
+               $this->lastTextId++;
+
+               $flags = Revision::compressRevisionText( $text );
+
+               # Write to external storage if required
+               if ( $wgDefaultExternalStore ) {
+                       # Store and get the URL
+                       $text = ExternalStore::insertToDefault( $text );
+                       if ( !$text ) {
+                               throw new MWException( "Unable to store text to 
external storage" );
+                       }
+                       if ( $flags ) {
+                               $flags .= ',';
+                       }
+                       $flags .= 'external';
+               }
 
                $e = array(
-                       'id' => $current_text_id,
+                       'id' => $this->lastTextId,
                        'text' => $text,
-                       'flags' => ''
+                       'flags' => $flags
                );
 
-               $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', 
$wgDBname ) );
-               $dbw->insert(
+               $this->dbw->insert(
                        'text',
                        array(
                                'old_id' => $e['id'],
@@ -601,9 +670,139 @@
                        __METHOD__
                );
 
-               return $current_text_id;
+               return $e['id'];
        }
 
+       /**
+        * Fixes a situation where we have the same title on local and remote 
wiki
+        * but with different page ID. The fix is to get the title for the local
+        * page ID on the remote wiki.
+        * If local page id doesn't exist on remote, delete (and archive) local 
page
+        * since it must have been deleted. If it exists (in this case with 
different
+        * title) then move it to where it belongs
+        *
+        * @param int $conflictingPageID page ID with different title on local
+        *     and remote wiki
+        * @param int $remoteNs Namespace number of remote title for page id
+        * @param string $remoteTitle remote title for page id
+        * @param int $initialConflict optional - original conflicting ID to 
avoid
+        *     endless loops if pages were moved in a cycle (A->B, B->C, C->A)
+        * @return array|null The page row (as an array) retrieved from database if an endless 
loop is
+        *     detected, used internally on recursive calls; null otherwise
+        */
+       function resolveConflictingTitle( $conflictingPageID, $remoteNs, 
$remoteTitle, $initialConflict = 0 ) {
+               $pageObj = null;
+               $pageTitle = Title::makeTitle( $remoteNs, $remoteTitle );
+               $this->output( "Warning: remote page ID $conflictingPageID has 
conflicting title $pageTitle with existing local page ID $conflictingPageID. 
Attempting to fix it...\n" );
+               if ( ! in_array( (string)$pageTitle, $this->movedTitles ) ) {
+                       $this->movedTitles[] = (string)$pageTitle;
+               }
+
+               # Get current title of the existing local page ID and move it 
to where it belongs
+               $params = array(
+                       'prop' => 'info',
+                       'pageids' => $conflictingPageID
+               );
+               $result = $this->bot->query( $params );
+               $info_pages = array_values( $result['query']['pages'] );
+
+               # The first call to resolveConflictingTitle won't enter here, but 
further recursive calls may
+               if ( isset( $info_pages[0]['missing'] ) ) {
+                       $this->output( "Page ID $conflictingPageID not found on 
remote wiki. Deleting...\n" );
+                       # Delete our copy, move revisions to archive
+                       # NOTE: If page was moved on remote wiki before 
deleting, we may potentially
+                       # leave revisions in archive with wrong title.
+                       $this->archiveAndDeletePage( $conflictingPageID, 
$remoteNs, $remoteTitle );
+               } else {
+                       # Move page, but check first that the target title 
doesn't exist on local to avoid a conflict
+                       $resultingNs = $info_pages[0]['ns'];
+                       $resultingTitle = $this->sanitiseTitle( 
$info_pages[0]['ns'], $info_pages[0]['title'] );
+                       $resultingPageID = $this->getPageID( $resultingNs, 
$resultingTitle );
+                       $resultingPageTitle = Title::makeTitle( $resultingNs, 
$resultingTitle );
+                       if ( ! in_array( (string)$resultingPageTitle, 
$this->movedTitles ) ) {
+                               $this->movedTitles[] = 
(string)$resultingPageTitle;
+                       }
+
+                       if ( $resultingPageID ) {
+
+                               if ( $initialConflict == $resultingPageID ) {
+                                       # This should never happen, unless we 
move A->B, C->A, B->C
+                                       # In this case, we can't just rename, 
because it will blatantly violate the unique key for title
+                                       # Get the page information, delete it 
from DB and restore it after the move
+                                       $this->output( "Endless loop detected! 
Storing page ID $resultingPageID for later restore.\n" );
+                                       $pageObj = (array)$this->dbw->selectRow(
+                                               'page',
+                                               '*',
+                                               array( 'page_id' => 
$resultingPageID ),
+                                               __METHOD__
+                                       );
+                                       $this->dbw->delete(
+                                               'page',
+                                               array( 'page_id' => 
$resultingPageID ),
+                                               __METHOD__
+                                       );
+                               } else {
+                                       # Whoops! resulting title already 
exists locally, here we go again...
+                                       $pageObj = 
$this->resolveConflictingTitle( $resultingPageID, $resultingNs, 
$resultingTitle, $conflictingPageID );
+                               }
+
+                               if ( $pageObj && $initialConflict === 0 ) {
+                                       # Once we've resolved all conflicts, if 
we returned a $pageObj and we're on the original call,
+                                       # restore the deleted page entry, with 
the correct page ID.
+                                       $this->output( sprintf( "Restoring page 
ID %s at title %s.\n",
+                                               $pageObj['page_id'], 
$resultingPageTitle ) );
+                                       $pageObj['page_namespace'] = 
$resultingNs;
+                                       $pageObj['page_title'] = 
$resultingTitle;
+                                       $this->dbw->insert(
+                                               'page',
+                                               $pageObj,
+                                               __METHOD__
+                                       );
+                                       # We've restored the page fixing the 
title, nothing more to do!
+                                       return null;
+                               }
+
+                       }
+                       $this->output( "Moving page ID $conflictingPageID to 
$resultingPageTitle...\n" );
+                       $this->dbw->update(
+                               'page',
+                               array(
+                                       'page_namespace' => $resultingNs,
+                                       'page_title' => $resultingTitle,
+                               ),
+                               array( 'page_id' => $conflictingPageID ),
+                               __METHOD__
+                       );
+               }
+               return $pageObj;
+       }
+
+       /**
+        * For use with deleted crap that chucks the id; spotty at best.
+        *
+        * @param int $ns Namespace number
+        * @param string $title Title of the page without the namespace
+        */
+       function getPageID( $ns, $title ) {
+               $pageID = (int)$this->dbw->selectField(
+                       'page',
+                       'page_id',
+                       array(
+                               'page_namespace' => $ns,
+                               'page_title' => $title,
+                       ),
+                       __METHOD__
+               );
+               return $pageID;
+       }
+
+       /**
+        * Strips the namespace from the title, if namespace number is 
different than 0,
+        *  and converts spaces to underscores. For use in database
+        *
+        * @param int $ns Namespace number
+        * @param string $title Title of the page with the namespace
+        */
        function sanitiseTitle( $ns, $title ) {
                if ( $ns != 0 ) {
                        $title = preg_replace( '/^[^:]*?:/', '', $title );
@@ -611,7 +810,6 @@
                $title = str_replace( ' ', '_', $title );
                return $title;
        }
-
 }
 
 $maintClass = 'GrabText';
diff --git a/mediawikibot.class.php b/mediawikibot.class.php
index 028e973..b5b6f38 100755
--- a/mediawikibot.class.php
+++ b/mediawikibot.class.php
@@ -104,6 +104,10 @@
                'rsd'
        );
 
+       /** Delays in seconds to wait before each successive retry on failure; gives up once exhausted
+        */
+       protected $retryTimes = array( 10, 30, 60, 120 );
+
        /** Constructor
         */
        public function __construct(
@@ -156,8 +160,10 @@
         *
         *  MediaWiki requires a dual login method to confirm authenticity. This
         *  entire method takes that into account.
+        * 
+        *  Returns null on success, or an array on failure
         */
-       public function login( $init = null ) {
+       public function login( $init = true ) {
                # build the url
                $url = $this->api_url( __FUNCTION__ );
                # build the params
@@ -167,8 +173,11 @@
                        'format' => 'php' # do not change this from php
                 );
                # get initial login info
-               if ( $init == null ) {
-                       $results = $this->login( true );
+               if ( $init ) {
+                       $results = $this->login( false );
+                       if ( ! isset( $results['login']['token'] ) ) {
+                               return $results;
+                       }
                        $results = ( array ) $results;
                } else {
                        $results = null;
@@ -180,8 +189,8 @@
                # get the data
                $data = $this->curl_post( $url, $params );
                # return or set data
-               if ( $data['login']['result'] != "Success" ) {
-                       return $data;
+               if ( ! is_array( $data ) && $data['login']['result'] != 
"Success" ) {
+                       return $data || [ 'Unknown error' ];
                }
        }
 
@@ -205,16 +214,15 @@
                # get the data
                $data = $this->curl_post( $url, $params, $multipart );
                # check data for grabbers; shut up loops are confusing it's too 
early.
-               if ( !isset( $data[$method] ) ) {
-                       echo "API error: no results; retrying in 5s\n";
-                       sleep( 5 );
-                       $data = $this->curl_post( $url, $params, $multipart );
-                       if ( !isset( $data[$method] ) ) {
-                               echo "API error: no results; retrying in 30s\n";
-                               sleep( 30 );
+               # Note: $data can be an empty array, resulting from api 
generators returning zero results
+               if ( $data === false ) {
+                       for ( $errors = 0; $errors < count( $this->retryTimes 
); $errors++) {
+                               $seconds = $this->retryTimes[$errors];
+                               echo "API error: no results; retrying in 
{$seconds}s\n";
+                               sleep( $seconds );
                                $data = $this->curl_post( $url, $params, 
$multipart );
-                               if ( !isset( $data[$method] ) ) {
-                                       echo "API error: no results found.\n";
+                               if ( $data !== false ) {
+                                       break;
                                }
                        }
                }
@@ -236,8 +244,9 @@
                # set the url, number of POST vars, POST data
                curl_setopt( $ch, CURLOPT_URL, $url );
                curl_setopt( $ch, CURLOPT_USERAGENT, USERAGENT );
-               curl_setopt( $ch, CURLOPT_RETURNTRANSFER,1 );
-               curl_setopt( $ch, CURLOPT_TIMEOUT, 15 );
+               curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
+               curl_setopt( $ch, CURLOPT_FAILONERROR, 1 );
+               curl_setopt( $ch, CURLOPT_TIMEOUT, 30 );
                curl_setopt( $ch, CURLOPT_COOKIEFILE, COOKIES );
                curl_setopt( $ch, CURLOPT_COOKIEJAR, COOKIES );
                curl_setopt( $ch, CURLOPT_POST, count( $params ) );
@@ -251,10 +260,14 @@
                }
                # execute the post
                $results = curl_exec( $ch );
+               $error = curl_errno( $ch );
+               if ( $error !== 0 ) {
+                       echo sprintf( "CURL ERROR: %s\n", curl_error( $ch ) );
+               }
                # close the connection
                curl_close( $ch );
                # return the unserialized results
-               return $this->format_results( $results, $params['format'] );
+               return $error !== 0 ? false : $this->format_results( $results, 
$params['format'] );
        }
 
        /** Check for multipart method

-- 
To view, visit https://gerrit.wikimedia.org/r/357224
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id8ad5a991823fcb19b4779eaaa6528f8cdc27a9b
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/tools/grabbers
Gerrit-Branch: master
Gerrit-Owner: Martineznovo <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to