Matthias Mullie has uploaded a new change for review. https://gerrit.wikimedia.org/r/242569
Change subject: [WIP] Dump Flow data
......................................................................

[WIP] Dump Flow data

Bug: T89398
Change-Id: I52bc7c0ce7813a78f9006ca4b7d931a905726c05
---
M includes/Search/TopicUpdater.php
A maintenance/dumpBackup.php
2 files changed, 477 insertions(+), 1 deletion(-)


git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Flow refs/changes/69/242569/1

diff --git a/includes/Search/TopicUpdater.php b/includes/Search/TopicUpdater.php
index 0002d28..81a6b96 100644
--- a/includes/Search/TopicUpdater.php
+++ b/includes/Search/TopicUpdater.php
@@ -73,7 +73,7 @@
  * Instead of querying for revisions (which is what we actually need), we'll
  * just query the workflow table, which will save us some complicated joins.
  * The workflow_id for a topic title (aka root post) is the same as its
- * revision is, so we can pass that to the root post loader and *poof*, we
+ * revision id, so we can pass that to the root post loader and *poof*, we
  * have our revisions!
  *
  * {@inheritDoc}
diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php
new file mode 100644
index 0000000..11cfa1a
--- /dev/null
+++ b/maintenance/dumpBackup.php
@@ -0,0 +1,476 @@
+<?php
+
+use Flow\Container;
+use Flow\Model\AbstractRevision;
+use Flow\Model\UUID;
+use Flow\Search\Updater;
+
+/**
+ * Script that dumps wiki pages or logging database into an XML interchange
+ * wrapper format for export or backup
+ *
+ * Copyright © 2005 Brion Vibber <br...@pobox.com>
+ * https://www.mediawiki.org/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Dump Maintenance
+ */
+
+$originalDir = getcwd();
+
+$optionsWithArgs = array( 'pagelist', 'start', 'end', 'revstart', 'revend' );
+
+$maintPath = ( getenv( 'MW_INSTALL_PATH' ) !== false
+	? getenv( 'MW_INSTALL_PATH' ) . '/maintenance'
+	: dirname( __FILE__ ) . '/../../../maintenance' );
+require_once $maintPath . '/commandLine.inc';
+require_once $maintPath . '/backup.inc';
+
+class FlowExporter extends WikiExporter {
+	public static function schemaVersion() {
+		return '1';
+	}
+
+	/**
+	 * Generates the distinct list of authors of an article
+	 * Not called by default (depends on $this->list_authors)
+	 * Can be set by Special:Export when not exporting whole history
+	 *
+	 * @param string $cond
+	 */
+	protected function do_list_authors( $cond ) {
+		// @todo: need this?
+
+		$this->author_list = "<contributors>";
+		// rev_deleted
+
+		$res = $this->db->select(
+			array( 'page', 'revision' ),
+			array( 'DISTINCT rev_user_text', 'rev_user' ),
+			array(
+				$this->db->bitAnd( 'rev_deleted', Revision::DELETED_USER ) . ' = 0',
+				$cond,
+				// join pages against all of their revisions
+				'page_id = rev_page',
+			),
+			__METHOD__
+		);
+
+		foreach ( $res as $row ) {
+			$this->author_list .= "<contributor>" .
+				"<username>" .
+				htmlentities( $row->rev_user_text ) .
+				"</username>" .
+				"<id>" .
+				$row->rev_user .
+				"</id>" .
+				"</contributor>";
+		}
+		$this->author_list .= "</contributors>";
+	}
+
+	/**
+	 * @param array|null $pages
+	 * @param int|null $startId
+	 * @param int|null $endId
+	 * @param UUID|null $revStartId
+	 * @param UUID|null $revEndId
+	 * @throws Exception
+	 * @throws TimestampException
+	 * @throws \Flow\Exception\InvalidInputException
+	 */
+	public function dump( array $pages = null, $startId = null, $endId = null, $revStartId = null, $revEndId = null ) {
+		/** @var Updater[] $updaters */
+		$updaters = Container::get( 'searchindex.updaters' );
+		foreach ( $updaters as $updaterType => $updater ) {
+			while ( true ) {
+				// fetch in batches
+				$options = array( 'LIMIT' => 50 ); // @todo
+
+				$conditions = $updater->buildQueryConditions( $revStartId, $revEndId, null );
+				if ( $pages ) {
+					$conditions['workflow_page_id'] = $pages;
+				}
+				if ( $startId ) {
+					/** @var DatabaseBase $dbr */
+					$dbr = Container::get( 'db.factory' )->getDB( DB_SLAVE );
+					$conditions[] = 'workflow_page_id >= ' . $dbr->addQuotes( $startId );
+				}
+				if ( $endId ) {
+					/** @var DatabaseBase $dbr */
+					$dbr = Container::get( 'db.factory' )->getDB( DB_SLAVE );
+					$conditions[] = 'workflow_page_id <= ' . $dbr->addQuotes( $endId );
+				}
+				$revisions = $updater->getRevisions( $conditions, $options );
+
+				// stop if we're all out of revisions
+				if ( !$revisions ) {
+					break;
+				}
+
+				var_dump( $revisions );
+				// @todo: all of this is to fetch the revs - steal code from dumpFrom to output it
+
+//				$total += $updater->updateRevisions( $revisions, null, null );
+//				$this->output( "Indexed $total $updaterType document(s)\n" );
+
+				// prepare for next batch, starting at the next id
+				// $prevStartId will default to around unix epoch - there can
+				// be no data before that
+				$prevStartId = $revStartId ?: UUID::getComparisonUUID( '1' );
+				$revStartId = $this->getNextFromId( $revisions );
+
+				// make sure we don't get stuck in an infinite loop
+				$diff = $prevStartId->getTimestampObj()->diff( $revStartId->getTimestampObj() );
+				// invert will be 1 if the diff is a negative time period from
+				// $prevStartId to $revStartId, meaning the new $revStartId is
+				// older than the previous one and we would keep re-fetching
+				// the same data
+				if ( $diff->invert ) {
+					$this->error(
+						'Got stuck in an infinite loop.' . "\n" .
+						'workflow_last_update_timestamp is likely incorrect ' .
+						'for some workflows.' . "\n" .
+						'Run maintenance/FlowFixWorkflowLastUpdateTimestamp.php ' .
+						'to automatically fix those.', 1 );
+				}
+
+				// prevent memory from being filled up
+				Container::get( 'storage' )->clear();
+			}
+		}
+	}
+
+	/**
+	 * @param AbstractRevision[] $revisions
+	 * @return UUID
+	 */
+	protected function getNextFromId( array $revisions ) {
+		/** @var AbstractRevision $last */
+		$last = end( $revisions );
+
+		if ( $last instanceof \Flow\Model\Header ) {
+			$timestamp = $last->getRevisionId()->getTimestampObj();
+		} else {
+			$timestamp = $last->getCollection()->getWorkflow()->getLastUpdatedObj();
+		}
+
+		// $timestamp is the timestamp of the last revision we fetched. fromId
+		// is inclusive, and we don't want to include what we already have here,
+		// so we'll advance 1 more and call that the next fromId
+		$timestamp = (int) $timestamp->getTimestamp( TS_UNIX );
+		return UUID::getComparisonUUID( $timestamp + 1 );
+	}
+
+	/**
+	 * @param string $cond
+	 * @throws MWException
+	 * @throws Exception
+	 */
+	public function dumpFrom( $cond = '' ) {
+		# For logging dumps...
+		if ( $this->history & self::LOGS ) {
+			$where = array( 'user_id = log_user' );
+			# Hide private logs
+			$hideLogs = LogEventsList::getExcludeClause( $this->db );
+			if ( $hideLogs ) {
+				$where[] = $hideLogs;
+			}
+			# Add on any caller specified conditions
+			if ( $cond ) {
+				$where[] = $cond;
+			}
+			# Get logging table name for logging.* clause
+			$logging = $this->db->tableName( 'logging' );
+
+			if ( $this->buffer == WikiExporter::STREAM ) {
+				$prev = $this->db->bufferResults( false );
+			}
+			$result = null; // Assuring $result is not undefined, if exception occurs early
+			try {
+				$result = $this->db->select( array( 'logging', 'user' ),
+					array( "{$logging}.*", 'user_name' ), // grab the user name
+					$where,
+					__METHOD__,
+					array( 'ORDER BY' => 'log_id', 'USE INDEX' => array( 'logging' => 'PRIMARY' ) )
+				);
+				$this->outputLogStream( $result );
+				if ( $this->buffer == WikiExporter::STREAM ) {
+					$this->db->bufferResults( $prev );
+				}
+			} catch ( Exception $e ) {
+				// Throwing the exception does not reliably free the resultset, and
+				// would also leave the connection in unbuffered mode.
+
+				// Freeing result
+				try {
+					if ( $result ) {
+						$result->free();
+					}
+				} catch ( Exception $e2 ) {
+					// Already in panic mode -> ignoring $e2 as $e has
+					// higher priority
+				}
+
+				// Putting database back in previous buffer mode
+				try {
+					if ( $this->buffer == WikiExporter::STREAM ) {
+						$this->db->bufferResults( $prev );
+					}
+				} catch ( Exception $e2 ) {
+					// Already in panic mode -> ignoring $e2 as $e has
+					// higher priority
+				}
+
+				// Inform caller about problem
+				throw $e;
+			}
+		# For page dumps...
+		} else {
+			$tables = array( 'page', 'revision' );
+			$opts = array( 'ORDER BY' => 'page_id ASC' );
+			$opts['USE INDEX'] = array();
+			$join = array();
+			if ( is_array( $this->history ) ) {
+				# Time offset/limit for all pages/history...
+				$revJoin = 'page_id=rev_page';
+				# Set time order
+				if ( $this->history['dir'] == 'asc' ) {
+					$op = '>';
+					$opts['ORDER BY'] = 'rev_timestamp ASC';
+				} else {
+					$op = '<';
+					$opts['ORDER BY'] = 'rev_timestamp DESC';
+				}
+				# Set offset
+				if ( !empty( $this->history['offset'] ) ) {
+					$revJoin .= " AND rev_timestamp $op " .
+						$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
+				}
+				$join['revision'] = array( 'INNER JOIN', $revJoin );
+				# Set query limit
+				if ( !empty( $this->history['limit'] ) ) {
+					$opts['LIMIT'] = intval( $this->history['limit'] );
+				}
+			} elseif ( $this->history & WikiExporter::FULL ) {
+				# Full history dumps...
+				$join['revision'] = array( 'INNER JOIN', 'page_id=rev_page' );
+			} elseif ( $this->history & WikiExporter::CURRENT ) {
+				# Latest revision dumps...
+				if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
+					$this->do_list_authors( $cond );
+				}
+				$join['revision'] = array( 'INNER JOIN', 'page_id=rev_page AND page_latest=rev_id' );
+			} elseif ( $this->history & WikiExporter::STABLE ) {
+				# "Stable" revision dumps...
+				# Default JOIN, to be overridden...
+				$join['revision'] = array( 'INNER JOIN', 'page_id=rev_page AND page_latest=rev_id' );
+				# One, and only one hook should set this, and return false
+				if ( Hooks::run( 'WikiExporter::dumpStableQuery', array( &$tables, &$opts, &$join ) ) ) {
+					throw new MWException( __METHOD__ . " given invalid history dump type." );
+				}
+			} elseif ( $this->history & WikiExporter::RANGE ) {
+				# Dump of revisions within a specified range
+				$join['revision'] = array( 'INNER JOIN', 'page_id=rev_page' );
+				$opts['ORDER BY'] = array( 'rev_page ASC', 'rev_id ASC' );
+			} else {
+				# Unknown history specification parameter?
+				throw new MWException( __METHOD__ . " given invalid history dump type." );
+			}
+			# Query optimization hacks
+			if ( $cond == '' ) {
+				$opts[] = 'STRAIGHT_JOIN';
+				$opts['USE INDEX']['page'] = 'PRIMARY';
+			}
+			# Build text join options
+			if ( $this->text != WikiExporter::STUB ) { // 1-pass
+				$tables[] = 'text';
+				$join['text'] = array( 'INNER JOIN', 'rev_text_id=old_id' );
+			}
+
+			if ( $this->buffer == WikiExporter::STREAM ) {
+				$prev = $this->db->bufferResults( false );
+			}
+
+			$result = null; // Assuring $result is not undefined, if exception occurs early
+			try {
+				Hooks::run( 'ModifyExportQuery',
+					array( $this->db, &$tables, &$cond, &$opts, &$join ) );
+
+				# Do the query!
+				$result = $this->db->select( $tables, '*', $cond, __METHOD__, $opts, $join );
+				# Output dump results
+				$this->outputPageStream( $result );
+
+				if ( $this->buffer == WikiExporter::STREAM ) {
+					$this->db->bufferResults( $prev );
+				}
+			} catch ( Exception $e ) {
+				// Throwing the exception does not reliably free the resultset, and
+				// would also leave the connection in unbuffered mode.
+
+				// Freeing result
+				try {
+					if ( $result ) {
+						$result->free();
+					}
+				} catch ( Exception $e2 ) {
+					// Already in panic mode -> ignoring $e2 as $e has
+					// higher priority
+				}
+
+				// Putting database back in previous buffer mode
+				try {
+					if ( $this->buffer == WikiExporter::STREAM ) {
+						$this->db->bufferResults( $prev );
+					}
+				} catch ( Exception $e2 ) {
+					// Already in panic mode -> ignoring $e2 as $e has
+					// higher priority
+				}
+
+				// Inform caller about problem
+				throw $e;
+			}
+		}
+	}
+}
+
+class FlowBackupDumper extends BackupDumper {
+	function dump( $history, $text = FlowExporter::TEXT ) {
+		# Notice messages will foul up your XML output even if they're
+		# relatively harmless.
+		if ( ini_get( 'display_errors' ) ) {
+			ini_set( 'display_errors', 'stderr' );
+		}
+
+		$this->initProgress( $history );
+
+		$db = $this->backupDb();
+		$exporter = new FlowExporter( $db, $history, FlowExporter::STREAM, $text );
+		$exporter->dumpUploads = $this->dumpUploads; // @todo
+		$exporter->dumpUploadFileContents = $this->dumpUploadFileContents; // @todo
+
+		$wrapper = new ExportProgressFilter( $this->sink, $this );
+		$exporter->setOutputSink( $wrapper );
+
+		if ( !$this->skipHeader ) {
+			$exporter->openStream();
+		}
+
+		$revStartId = $this->revStartId ? UUID::create( $this->revStartId ) : null;
+		$revEndId = $this->revEndId ? UUID::create( $this->revEndId ) : null;
+		$exporter->dump( $this->pages, $this->startId, $this->endId, $revStartId, $revEndId );
+
+		if ( !$this->skipFooter ) {
+			$exporter->closeStream();
+		}
+
+		$this->report( true );
+	}
+}
+
+$dumper = new FlowBackupDumper( $argv );
+
+if ( isset( $options['quiet'] ) ) {
+	$dumper->reporting = false;
+}
+
+if ( isset( $options['pagelist'] ) ) {
+	$olddir = getcwd();
+	chdir( $originalDir );
+	$pages = file( $options['pagelist'] );
+	chdir( $olddir );
+	if ( $pages === false ) {
+		echo "Unable to open file {$options['pagelist']}\n";
+		die( 1 );
+	}
+	$pages = array_map( 'trim', $pages );
+	$dumper->pages = array_filter( $pages, create_function( '$x', 'return $x !== "";' ) );
+}
+
+if ( isset( $options['start'] ) ) {
+	$dumper->startId = intval( $options['start'] );
+}
+if ( isset( $options['end'] ) ) {
+	$dumper->endId = intval( $options['end'] );
+}
+
+if ( isset( $options['revstart'] ) ) {
+	$dumper->revStartId = intval( $options['revstart'] );
+}
+if ( isset( $options['revend'] ) ) {
+	$dumper->revEndId = intval( $options['revend'] );
+}
+$dumper->skipHeader = isset( $options['skip-header'] );
+$dumper->skipFooter = isset( $options['skip-footer'] );
+$dumper->dumpUploads = isset( $options['uploads'] );
+$dumper->dumpUploadFileContents = isset( $options['include-files'] );
+
+$textMode = isset( $options['stub'] ) ? WikiExporter::STUB : WikiExporter::TEXT;
+
+if ( isset( $options['full'] ) ) {
+	$dumper->dump( WikiExporter::FULL, $textMode );
+} elseif ( isset( $options['current'] ) ) {
+	$dumper->dump( WikiExporter::CURRENT, $textMode );
+} elseif ( isset( $options['revrange'] ) ) {
+	$dumper->dump( WikiExporter::RANGE, $textMode );
+} else {
+	$dumper->progress( <<<ENDS
+This script dumps the wiki page or logging database into an
+XML interchange wrapper format for export or backup.
+
+XML output is sent to stdout; progress reports are sent to stderr.
+
+WARNING: this is not a full database dump! It is merely for public export
+         of your wiki. For full backup, see our online help at:
+         https://www.mediawiki.org/wiki/Backup
+
+Usage: php dumpBackup.php <action> [<options>]
+Actions:
+  --full          Dump all revisions of every page.
+  --current       Dump only the latest revision of every page.
+  --pagelist=<file>
+                  Where <file> is a list of page titles to be dumped
+  --revrange      Dump specified range of revisions, requires
+                  revstart and revend options.
+Options:
+  --quiet         Don't dump status reports to stderr.
+  --report=n      Report position and speed after every n pages processed.
+                  (Default: 100)
+  --server=h      Force reading from MySQL server h
+  --start=n       Start from page_id or log_id n
+  --end=n         Stop before page_id or log_id n (exclusive)
+  --revstart=n    Start from rev_id n
+  --revend=n      Stop before rev_id n (exclusive)
+  --skip-header   Don't output the <mediawiki> header
+  --skip-footer   Don't output the </mediawiki> footer
+  --stub          Don't perform old_text lookups; for 2-pass dump
+  --uploads       Include upload records without files
+  --include-files Include files within the XML stream
+  --conf=<file>   Use the specified configuration file (LocalSettings.php)
+
+  --wiki=<wiki>   Only back up the specified <wiki>
+
+Fancy stuff: (Works? Add examples please.)
+  --plugin=<class>[:<file>]   Load a dump plugin class
+  --output=<type>:<file>      Begin a filtered output stream;
+                              <type>s: file, gzip, bzip2, 7zip
+  --filter=<type>[:<options>] Add a filter on an output branch
+
+ENDS
+	);
+}

--
To view, visit https://gerrit.wikimedia.org/r/242569
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I52bc7c0ce7813a78f9006ca4b7d931a905726c05
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Flow
Gerrit-Branch: master
Gerrit-Owner: Matthias Mullie <mmul...@wikimedia.org>
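For anyone who wants to try the patch: the script is invoked like core's
dumpBackup.php, from the MediaWiki root. Hypothetical example runs, untested
against this WIP patch (its XML output is still marked @todo, and the
filtered-output variant is doubly speculative per the "Works? Add examples
please." note in the help text):

  php extensions/Flow/maintenance/dumpBackup.php --current --quiet > flow-current.xml
  php extensions/Flow/maintenance/dumpBackup.php --full --start=100 --end=200 > flow-pages.xml
  php extensions/Flow/maintenance/dumpBackup.php --full --output=gzip:flow-full.xml.gz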