Denny Vrandecic has uploaded a new change for review. https://gerrit.wikimedia.org/r/64081
Change subject: first draft of the RDF dump creating script (DO NOT MERGE)
......................................................................

first draft of the RDF dump creating script (DO NOT MERGE)

Change-Id: I0a5122078772171c8f0b786953d11fbd458daef7
---
A repo/maintenance/transformXMLtoRDF.php
1 file changed, 200 insertions(+), 0 deletions(-)

  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/81/64081/1

diff --git a/repo/maintenance/transformXMLtoRDF.php b/repo/maintenance/transformXMLtoRDF.php
new file mode 100644
index 0000000..7fd0947
--- /dev/null
+++ b/repo/maintenance/transformXMLtoRDF.php
@@ -0,0 +1,200 @@
<?php
/**
 * Transform XML dump into RDF dump.
 *
 * Use as follows:
 *
 *  php transformXMLtoRDF.php --types <types-file> pages-meta-current.xml.bz2
 *
 * The --types option is registered as required; --report, --output and
 * --debug are optional. Without a file argument the dump is read from stdin.
 *
 * Copyright © 2005 Brion Vibber <br...@pobox.com>
 * http://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @since 0.4
 *
 * @file
 * @ingroup WikibaseRepo
 *
 * @licence GNU GPL v2+
 * @author Denny Vrandecic < vrande...@gmail.com >
 */

$basePath = getenv( 'MW_INSTALL_PATH' ) !== false ? getenv( 'MW_INSTALL_PATH' ) : __DIR__ . '/../../../..';

require_once $basePath . '/maintenance/Maintenance.php';
require_once $basePath . '/includes/Exception.php';

/**
 * Maintenance script that transforms the XML dump into an RDF dump.
 *
 * In this draft the script only walks the dump and counts pages and
 * revisions; the entity branch in handleRevision() is still a TODO and no
 * RDF is written.
 *
 * @ingroup WikibaseRepo
 */
class transformXMLtoRDF extends Maintenance {
	/** @var int A progress report is shown whenever the page count is a multiple of this */
	public $reportingInterval = 1000;

	/** @var int Number of pages announced through reportPage() */
	public $pageCount = 0;

	/** @var int Number of revisions that passed through handleRevision() */
	public $revCount = 0;

	/** @var array Initialised empty; nothing in this class fills or reads it yet */
	public $propertyTypes = array();

	/** @var resource Handle on php://stderr used by progress() */
	public $stderr;

	/** @var float microtime( true ) taken when the import starts */
	public $startTime;

	/** @var mixed Value returned by WikiImporter::setRevisionCallback() */
	public $importCallback;

	/** @var mixed Value returned by WikiImporter::setUploadCallback() */
	public $uploadCallback;

	/** @var mixed Value returned by WikiImporter::setLogItemCallback() */
	public $logItemCallback;

	/**
	 * Sets the help text, opens the stderr handle and registers the
	 * command line options and the optional dump file argument.
	 */
	public function __construct() {
		parent::__construct();

		$wrappers = stream_get_wrappers();
		$gz = in_array( 'compress.zlib', $wrappers ) ? 'ok' : '(disabled; requires PHP zlib module)';
		$bz2 = in_array( 'compress.bzip2', $wrappers ) ? 'ok' : '(disabled; requires PHP bzip2 module)';

		$this->mDescription = <<<TEXT
This script reads pages from an XML file as produced from Special:Export or
dumpBackup.php, and transforms them into an RDF/XML file containing the knowledge base.

Compressed XML files may be read directly:
	.gz $gz
	.bz2 $bz2
	.7z (if 7za executable is in PATH)
TEXT;
		$this->stderr = fopen( "php://stderr", "wt" );

		$this->addOption( 'report',
			'Report position and speed after every n entities processed', false, true );
		// 'types' and 'output' are registered here but not read anywhere in this class yet.
		$this->addOption( 'types',
			'file containing lines each with a space-separated tuple of property ID and type',
			true, true );
		$this->addOption( 'output', 'name of the output RDF/XML file', false, true );
		$this->addOption( 'debug', 'Output extra verbose debug information' );
		$this->addArg( 'file', 'Dump file to transform [else use stdin]', false );
	}

	/**
	 * Entry point: reads the reporting interval, then imports from the file
	 * argument if one was given and from stdin otherwise.
	 */
	public function execute() {
		$this->reportingInterval = intval( $this->getOption( 'report', 1000 ) );
		if ( !$this->reportingInterval ) {
			$this->reportingInterval = 1000; // avoid division by zero
		}

		if ( $this->hasArg() ) {
			$this->importFromFile( $this->getArg() );
		} else {
			$this->importFromStdin();
		}

		$this->output( "Done!\n" );
	}

	/**
	 * Page callback registered with the importer; only counts the page.
	 *
	 * @param mixed $page Value passed by the importer (unused)
	 */
	public function reportPage( $page ) {
		$this->pageCount++;
	}

	/**
	 * Revision callback registered with the importer.
	 *
	 * Skips revisions without a title, prints unexpected content models to
	 * stderr, counts the revision and triggers the periodic report.
	 *
	 * @param WikiRevision $rev
	 */
	public function handleRevision( WikiRevision $rev ) {
		$title = $rev->getTitle();
		if ( !$title ) {
			$this->progress( "Got bogus revision with null title!" );
			return;
		}

		$model = $rev->getModel();
		$isItem = ( $model === CONTENT_MODEL_WIKIBASE_ITEM );
		$isProperty = ( $model === CONTENT_MODEL_WIKIBASE_PROPERTY );
		// Must be `||`: the word operator `OR` binds looser than `=`, so
		// `$entity = $item OR $property` only ever assigned $item.
		$isEntity = $isItem || $isProperty;

		// Draft-stage diagnostics: print every content model that is not one of these.
		$expectedModels = array( 'wikibase-item', 'wikitext', 'javascript', 'css' );
		if ( !in_array( $model, $expectedModels, true ) ) {
			$this->progress( $model );
		}

		if ( $isEntity ) {
			// TODO process the actual content $rev->getContent();
		}

		$this->revCount++;
		$this->report();
	}

	/**
	 * Shows a report when the page count is a multiple of the reporting
	 * interval, or (with $final set) when it is not.
	 *
	 * @param bool $final
	 */
	public function report( $final = false ) {
		if ( $final xor ( $this->pageCount % $this->reportingInterval == 0 ) ) {
			$this->showReport();
		}
	}

	/**
	 * Prints the running totals and the per-second rates to stderr unless
	 * the script runs in quiet mode.
	 */
	public function showReport() {
		if ( $this->mQuiet ) {
			return;
		}

		$delta = microtime( true ) - $this->startTime;
		if ( $delta ) {
			$rate = sprintf( "%.2f", $this->pageCount / $delta );
			$revrate = sprintf( "%.2f", $this->revCount / $delta );
		} else {
			$rate = '-';
			$revrate = '-';
		}

		# Logs dumps don't have page tallies
		if ( $this->pageCount ) {
			$this->progress( "$this->pageCount ($rate pages/sec $revrate revs/sec)" );
		} else {
			$this->progress( "$this->revCount ($revrate revs/sec)" );
		}
	}

	/**
	 * Writes one line to stderr.
	 *
	 * @param string $string
	 */
	public function progress( $string ) {
		fwrite( $this->stderr, $string . "\n" );
	}

	/**
	 * Opens the dump file, choosing a decompression stream wrapper from the
	 * file extension, and imports from it.
	 *
	 * @param string $filename
	 * @return mixed Result of importFromHandle()
	 * @throws MWException If the file cannot be opened
	 */
	public function importFromFile( $filename ) {
		if ( preg_match( '/\.gz$/', $filename ) ) {
			$filename = 'compress.zlib://' . $filename;
		} elseif ( preg_match( '/\.bz2$/', $filename ) ) {
			$filename = 'compress.bzip2://' . $filename;
		} elseif ( preg_match( '/\.7z$/', $filename ) ) {
			$filename = 'mediawiki.compress.7z://' . $filename;
		}

		$file = fopen( $filename, 'rt' );
		if ( $file === false ) {
			// fopen() only emits a warning on failure; stop here instead of
			// handing `false` to the importer.
			throw new MWException( "Could not open $filename for reading" );
		}

		return $this->importFromHandle( $file );
	}

	/**
	 * Imports from standard input; shows the help text when stdin is a terminal.
	 *
	 * @return mixed Result of importFromHandle()
	 */
	public function importFromStdin() {
		$file = fopen( 'php://stdin', 'rt' );
		if ( self::posix_isatty( $file ) ) {
			$this->maybeHelp( true );
		}
		return $this->importFromHandle( $file );
	}

	/**
	 * Runs a WikiImporter over the given stream with this object's page and
	 * revision callbacks installed and the other callbacks set to null.
	 *
	 * @param resource $handle Open stream containing the XML dump
	 * @return mixed Result of WikiImporter::doImport()
	 */
	public function importFromHandle( $handle ) {
		$this->startTime = microtime( true );

		$source = new ImportStreamSource( $handle );
		$importer = new WikiImporter( $source );

		if ( $this->hasOption( 'debug' ) ) {
			$importer->setDebug( true );
		}
		$importer->setNoUpdates( true );

		// Objects are passed by handle, so the callbacks need no `&$this`.
		$importer->setPageCallback( array( $this, 'reportPage' ) );
		$this->importCallback = $importer->setRevisionCallback( array( $this, 'handleRevision' ) );
		$this->uploadCallback = $importer->setUploadCallback( null );
		$this->logItemCallback = $importer->setLogItemCallback( null );
		$importer->setPageOutCallback( null );

		return $importer->doImport();
	}
}

$maintClass = 'transformXMLtoRDF';
require_once RUN_MAINTENANCE_IF_MAIN;
?>
--
To view, visit https://gerrit.wikimedia.org/r/64081
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0a5122078772171c8f0b786953d11fbd458daef7
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Denny Vrandecic <denny.vrande...@wikimedia.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits