Martineznovo has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/361207 )
Change subject: grabFiles: Robustness, code style and features ...................................................................... grabFiles: Robustness, code style and features - Use class member variables instead of globals everywhere - Using max api limits instead of hardcoded values - Check only local repo for existing files - End date cutoff changed to not exclude the entire file revision history, but just new versions of files - Added revdeletion handling from text grabbers - Added new parameter "wikia" to handle this special host mangling original files - Download file and store has been rewritten to use MWHttpRequest to download the file and check the sha1 of the file, with multiple retries and possible cache bypass, and use MediaWiki classes to store files, which use local repo configuration from LocalSettings.php to store the file where it should be instead of using dumb assumptions about location based on the remote location. It should even support custom file backends now. Change-Id: Ie9ce0fafaae6a9f4ffe92123806b891c7bb0e760 --- M grabFiles.php 1 file changed, 349 insertions(+), 119 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/tools/grabbers refs/changes/07/361207/1 diff --git a/grabFiles.php b/grabFiles.php index d19ffba..b7e9ec7 100755 --- a/grabFiles.php +++ b/grabFiles.php @@ -6,7 +6,9 @@ * @file * @ingroup Maintenance * @author Calimonious the Estrange + * @author Jesús Martínez <[email protected]> * @date 31 December 2012 + * @version 1.0 * @note Based on code by Misza, Jack Phoenix and Edward Chernenko. 
*/ @@ -20,6 +22,49 @@ require_once 'mediawikibot.class.php'; class GrabFiles extends Maintenance { + + /** + * End date + * + * @var string + */ + protected $endDate; + + /** + * Handle to the database connection + * + * @var DatabaseBase + */ + protected $dbw; + + /** + * MediaWikiBot instance + * + * @var MediaWikiBot + */ + protected $bot; + + /** + * Local file repository + * + * @var LocalRepo + */ + protected $localRepo; + + /** + * Temporal file handle + * + * @var FileHandle + */ + protected $mTmpHandle; + + /** + * The target wiki is on Wikia + * + * @var boolean + */ + protected $isWikia; + public function __construct() { parent::__construct(); $this->mDescription = 'Grabs files from a pre-existing wiki into a new wiki. Assumes a normal file hashing structure on each end.'; @@ -29,46 +74,54 @@ $this->addOption( 'db', 'Database name, if we don\'t want to write to $wgDBname', false, true ); $this->addOption( 'from', 'Name of file to start from', false, true ); $this->addOption( 'enddate', 'Date after which to ignore new files (20121222142317, 2012-12-22T14:23:17T, etc)', false, true ); + $this->addOption( 'wikia', 'Set this param if the target wiki is on Wikia, which needs to handle URLs in a special way', false, false ); } public function execute() { - global $wgUploadDirectory, $endDate; - - $endDate = $this->getOption( 'enddate' ); - if ( $endDate ) { - $endDate = wfTimestamp( TS_MW, $endDate ); - if ( !$endDate ) { - $this->error( "Invalid enddate format.\n", true ); - } - } else { - $endDate = wfTimestampNow(); - } + global $wgDBname; $url = $this->getOption( 'url' ); if ( !$url ) { - $this->error( 'The URL to the target wiki\'s api.php is required.', true ); + $this->error( 'The URL to the target wiki\'s api.php is required.', 1 ); } + + $this->endDate = $this->getOption( 'enddate' ); + if ( $this->endDate ) { + $this->endDate = wfTimestamp( TS_MW, $this->endDate ); + if ( !$this->endDate ) { + $this->error( "Invalid enddate format.\n", 1 ); + } + } 
else { + $this->endDate = wfTimestampNow(); + } + + # Get a single DB_MASTER connection + $this->dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', $wgDBname ) ); + + # Get a local repo instance + $this->localRepo = RepoGroup::singleton()->getLocalRepo(); + + $this->isWikia = $this->getOption( 'wikia' ); + $user = $this->getOption( 'username' ); $password = $this->getOption( 'password' ); - $this->output( "Working...\n" ); - # bot class and log in if requested if ( $user && $password ) { - $bot = new MediaWikiBot( + $this->bot = new MediaWikiBot( $url, 'json', $user, $password, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:13.0) Gecko/20100101 Firefox/13.0.1' ); - if ( !$bot->login() ) { + if ( !$this->bot->login() ) { $this->output( "Logged in as $user...\n" ); } else { - $this->output( "WARNING: Failed to log in as $user.\n" ); + $this->error( "Failed to log in as $user.\n", 1 ); } } else { - $bot = new MediaWikiBot( + $this->bot = new MediaWikiBot( $url, 'json', '', @@ -79,15 +132,19 @@ $params = array( 'generator' => 'allimages', - 'gailimit' => 500, + 'gailimit' => 'max', 'prop' => 'imageinfo', 'iiprop' => 'timestamp|user|userid|comment|url|size|sha1|mime|metadata|archivename|bitdepth|mediatype', - 'iilimit' => 500 + 'iilimit' => 'max' ); $gaifrom = $this->getOption( 'from' ); $more = true; $count = 0; + + if ( $gaifrom !== null ) { + $params['gaifrom'] = $gaifrom; + } $this->output( "Processing and downloading files...\n" ); while ( $more ) { @@ -96,9 +153,9 @@ } else { $params['gaifrom'] = $gaifrom; } - $result = $bot->query( $params ); + $result = $this->bot->query( $params ); if ( empty( $result['query']['pages'] ) ) { - $this->error( 'No files found...', true ); + $this->error( 'No files found...', 1 ); } foreach ( $result['query']['pages'] as $file ) { @@ -115,25 +172,40 @@ $this->output( "$count files downloaded.\n" ); } + /** + * Process the information from a given file returned by the api + * + * @param array $entry Page data returned from the api 
with imageinfo + * @return int Number of image revisions processed. + */ function processFile( $entry ) { - global $wgDBname, $wgUploadDirectory, $endDate; + global $wgLang; - $name = $entry['title']; - $name = preg_replace( '/^[^:]*?:/', '', $name ); - $name = str_replace( ' ', '_', $name ); + $name = $this->sanitiseTitle( $entry['ns'], $entry['title'] ); # Check if file already exists. - $file = wfFindFile( $name ); + # NOTE: wfFindFile() checks foreign repos too. Use local repo only + $file = $this->localRepo->findFile( $name ); if ( is_object( $file ) ) { return 0; } - $this->output( "Processing {$entry['title']}: " ); + $this->output( "Processing {$name}: " ); $count = 0; foreach ( $entry['imageinfo'] as $fileVersion ) { - if ( !$count && $endDate < wfTimestamp( TS_MW, $fileVersion['timestamp'] ) ) { - return 0; + # Api returns file revisions from new to old. + # WARNING: If a new version of a file is uploaded after the start of the script + # (or endDate), the file and all its previous revisions would be skipped, + # potentially leaving pages that were using the old image with redlinks. 
+ # To prevent this, we'll skip only more recent versions, and mark the first + # one before the end date as the latest + if ( !$count && wfTimestamp( TS_MW, $fileVersion['timestamp'] ) > $this->endDate ) { + #return 0; + continue; + } + if ( !$count && isset( $fileVersion['archivename'] ) ) { + unset( $fileVersion['archivename'] ); } # Check for Wikia's videos @@ -148,111 +220,127 @@ ) { $this->output( "...this appears to be a video, skipping it.\n" ); + return 0; + } + + # Sloppy handler for revdeletions; just fills them in with dummy text + # and sets bitfield thingy + $filedeleted = 0; + if ( isset( $fileVersion['userhidden'] ) ) { + $filedeleted = $filedeleted | File::DELETED_USER; + if ( !isset( $fileVersion['user'] )) { + $fileVersion['user'] = ''; # username removed + } + if ( !isset( $fileVersion['userid'] )) { + $fileVersion['userid'] = 0; + } + } + if ( isset( $fileVersion['commenthidden'] ) ) { + $filedeleted = $filedeleted | File::DELETED_COMMENT; + $comment = ''; # edit summary removed + } else { + $comment = $fileVersion['comment']; + if ( $comment ) { + $comment = $wgLang->truncate( $comment, 255 ); + } else { + $comment = ''; + } + } + if ( isset( $fileVersion['filehidden'] ) ) { + $filedeleted = $filedeleted | File::DELETED_FILE; + } + if ( isset ( $fileVersion['suppressed'] ) ) { + $filedeleted = $filedeleted | File::DELETED_RESTRICTED; + } + + if ( !isset( $fileVersion['url'] ) ) { + # If the file is supressed and we don't have permissions, + # we won't get URL nor MIME. MIME is a required field, + # skip the file revision instead of crashing + $this->output( "File supressed, skipping it\n" ); continue; } $fileurl = $fileVersion['url']; - // Check for the presence of Wikia's Vignette's parameters and - // if they're there, remove 'em to ensure that the files are - // saved under their correct names. 
- // @see http://community.wikia.com/wiki/User_blog:Nmonterroso/Introducing_Vignette,_Wikia%27s_New_Thumbnailer - if ( preg_match( '/\/revision\/latest\?cb=(.*)$/', $fileurl, $matches ) ) { - $fileurl = preg_replace( '/\/revision\/latest\?cb=(.*)$/', '', $fileurl ); + if ( $this->isWikia ) { + # Wikia is now serving "optimised" lossy images instead of the originals + # See http://community.wikia.com/wiki/Thread:1200407 + # Add format=original to the URL to hopefully force it to download the original + if ( strpos( $fileurl, '?' ) !== false ) { + $fileurl .= '&format=original'; + } else { + $fileurl .= '?format=original'; + } } + + $file_e = array( + 'name' => $name, + 'size' => $fileVersion['size'], + 'width' => $fileVersion['width'], + 'height' => $fileVersion['height'], + 'bits' => $fileVersion['bitdepth'], + 'description' => $comment, + 'user' => $fileVersion['userid'], + 'user_text' => $fileVersion['user'], + 'timestamp' => wfTimestamp( TS_MW, $fileVersion['timestamp'] ), + 'media_type' => $fileVersion['mediatype'], + 'deleted' => $filedeleted, + 'sha1' => Wikimedia\base_convert( $fileVersion['sha1'], 16, 36, 31 ), + 'metadata' => serialize( $this->processMetaData( $fileVersion['metadata'] ) ), + ); + + $mime = $fileVersion['mime']; + $mimeBreak = strpos( $mime, '/' ); + $file_e['major_mime'] = substr( $mime, 0, $mimeBreak ); + $file_e['minor_mime'] = substr( $mime, $mimeBreak + 1 ); if ( isset( $fileVersion['archivename'] ) ) { # Old version + $this->dbw->begin(); $e = array( 'oi_name' => $name, 'oi_archive_name' => $fileVersion['archivename'], - 'oi_size' => $fileVersion['size'], - 'oi_width' => $fileVersion['width'], - 'oi_height' => $fileVersion['height'], - 'oi_bits' => $fileVersion['bitdepth'], - 'oi_description' => $fileVersion['comment'], - 'oi_user' => $fileVersion['userid'], - 'oi_user_text' => $fileVersion['user'], - 'oi_timestamp' => wfTimestamp( TS_MW, $fileVersion['timestamp'] ), - 'oi_media_type' => $fileVersion['mediatype'], - 'oi_deleted' => 0, 
- 'oi_sha1' => $fileVersion['sha1'], - 'oi_metadata' => serialize( $fileVersion['metadata'] ) + 'oi_size' => $file_e['size'], + 'oi_width' => $file_e['width'], + 'oi_height' => $file_e['height'], + 'oi_bits' => $file_e['bits'], + 'oi_description' => $file_e['description'], + 'oi_user' => $file_e['user'], + 'oi_user_text' => $file_e['user_text'], + 'oi_timestamp' => $file_e['timestamp'], + 'oi_media_type' => $file_e['media_type'], + 'oi_deleted' => $file_e['deleted'], + 'oi_sha1' => $file_e['sha1'], + 'oi_metadata' => $file_e['metadata'], + 'oi_major_mime' => $file_e['major_mime'], + 'oi_minor_mime' => $file_e['minor_mime'] ); - - $mime = $fileVersion['mime']; - $mimeBreak = strpos( $mime, '/' ); - $e['oi_major_mime'] = substr( $mime, 0, $mimeBreak ); - $e['oi_minor_mime'] = substr( $mime, $mimeBreak + 1 ); - - $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', $wgDBname ) ); - $dbw->begin(); - $dbw->insert( 'oldimage', $e, __METHOD__ ); - $dbw->commit(); - - $urlparts = explode( '/', $fileurl ); - $urli = count( $urlparts ); - - $fileLocalPath = $wgUploadDirectory . '/archive/' . $urlparts[$urli - 3] . '/' . $urlparts[$urli - 2] . '/' . $name; - $fileLocalDir = $wgUploadDirectory . '/archive/' . $urlparts[$urli - 3] . '/' . $urlparts[$urli - 2] . 
'/'; + $this->dbw->insert( 'oldimage', $e, __METHOD__ ); + $this->storeFileFromURL( $name, $fileurl, $file_e['timestamp'] ); + $this->dbw->commit(); } else { # Current version - # Check if title is present in database because someone screwed up - $dbr = wfGetDB( DB_SLAVE, array(), $this->getOption( 'db', $wgDBname ) ); - $dbr->begin(); - $result = $dbr->select( - 'image', - 'img_name', - array( 'img_name' => $name ), - __METHOD__ + $this->dbw->begin(); + $e = array( + 'img_name' => $name, + 'img_size' => $file_e['size'], + 'img_width' => $file_e['width'], + 'img_height' => $file_e['height'], + 'img_bits' => $file_e['bits'], + 'img_description' => $file_e['description'], + 'img_user' => $file_e['user'], + 'img_user_text' => $file_e['user_text'], + 'img_timestamp' => $file_e['timestamp'], + 'img_media_type' => $file_e['media_type'], + 'img_sha1' => $file_e['sha1'], + 'img_metadata' => $file_e['metadata'], + 'img_major_mime' => $file_e['major_mime'], + 'img_minor_mime' => $file_e['minor_mime'] ); - $dbr->commit(); - if ( !$dbr->fetchObject( $result ) ) { - $e = array( - 'img_name' => $name, - 'img_size' => $fileVersion['size'], - 'img_width' => $fileVersion['width'], - 'img_height' => $fileVersion['height'], - 'img_metadata' => serialize( $fileVersion['metadata'] ), - 'img_bits' => $fileVersion['bitdepth'], - 'img_media_type' => $fileVersion['mediatype'], - 'img_description' => $fileVersion['comment'], - 'img_user' => $fileVersion['userid'], - 'img_user_text' => $fileVersion['user'], - 'img_timestamp' => wfTimestamp( TS_MW, $fileVersion['timestamp'] ), - 'img_sha1' => $fileVersion['sha1'] - ); - - $mime = $fileVersion['mime']; - $mimeBreak = strpos( $mime, '/' ); - $e['img_major_mime'] = substr( $mime, 0, $mimeBreak ); - $e['img_minor_mime'] = substr( $mime, $mimeBreak + 1 ); - - $dbw = wfGetDB( DB_MASTER, array(), $this->getOption( 'db', $wgDBname ) ); - - $dbw->insert( 'image', $e, __METHOD__ ); - $dbw->commit(); - } - - $urlparts = explode( '/', $fileurl ); - $urli 
= count( $urlparts ); - - $fileLocalPath = $wgUploadDirectory . '/' . $urlparts[$urli - 3] . '/' . $urlparts[$urli - 2] . '/' . $name; - $fileLocalDir = $wgUploadDirectory . '/' . $urlparts[$urli - 3] . '/' . $urlparts[$urli - 2] . '/'; + $this->dbw->insert( 'image', $e, __METHOD__ ); + $this->storeFileFromURL( $name, $fileurl, false ); + $this->dbw->commit(); } - - wfSuppressWarnings(); - $fileContent = file_get_contents( $fileurl ); - wfRestoreWarnings(); - if ( !$fileContent ) { - $this->output( "$name not found on remote server.\n" ); - continue; - } - - # Directory structure and save - if ( !file_exists( $fileLocalDir ) ) { - mkdir( $fileLocalDir, 0777, true ); - } - file_put_contents( $fileLocalPath, $fileContent ); $count++; } @@ -264,6 +352,148 @@ return $count; } + + /** + * Stores the file from the URL to the local repository + * + * @param string $name Name of the file + * @param string $fileurl URL of the file to be downloaded + * @param int|boolean timestamp in case of old file or false otherwise + * @param string $sha1 sha of the file to ensure that it's not corrupt (optional) + * @return boolean true if the store succeeds + */ + function storeFileFromURL( $name, $fileurl, $timestamp, $sha1 = null ) { + $maxRetries = 3; # Just an arbitrary value + $downloadSuccess = false; + $retval = false; + $tmpPath = tempnam( wfTempDir(), 'grabfile' ); + $targeturl = $fileurl; + # Retry in case of download failure + for ( $retries = 0; !$downloadSuccess && $retries < $maxRetries; $retries++ ) { + if ( $retries > 0 ) { + # Maybe sha1 didn't match because an old version of the file + # is cached on the server. Try to append a random parameter + # to the URL to trick the server to get a fresh version + if ( strpos( $fileurl, '?' 
) !== false ) { + $targeturl = "{$fileurl}&purge={$retries}"; + } else { + $targeturl = "{$fileurl}?purge={$retries}"; + } + # Also wait some time in case the server is temporarily unavailable + sleep( 20 * $retries ); + } + $downloadSuccess = $this->downloadFile( $targeturl, $tmpPath, $sha1 ); + } + if ( $downloadSuccess ) { + $file = $this->localRepo->newFile( $name, $timestamp ); + $status = $file->publish( $tmpPath ); + if ( $status->isOK() ) { + $retval = true; + } else { + $this->output( sprintf( "Error when publishing file %s to the local file repo: %s\n", + $name, implode( '. ', $status->getErrors() ) ) ); + } + } else { + $this->output( sprintf( "Failed to save file %s from URL %s\n", $name, $fileurl ) ); + } + unlink( $tmpPath ); + return $retval; + } + + /** + * Downloads a URL to a specified temporal file + * + * @param string $fileurl URL of the file to be downloaded + * @param string $targetTempFile path for the downloaded file + * @param string $sha1 sha of the file to ensure that it's not corrupt (optional) + * @return bool true if the operation succeeded + */ + function downloadFile( $fileurl, $targetTempFile, $sha1 = null ) { + $this->mTmpHandle = fopen( $targetTempFile, 'wb' ); + $req = MWHttpRequest::factory( $fileurl, array( 'timeout' => 90 ), __METHOD__ ); + $req->setCallback( [ $this, 'saveTempFileChunk' ] ); + $status = $req->execute(); + fclose( $this->mTmpHandle ); + if ( $status->isOK() ) { + if ( is_null( $sha1 ) ) { + return true; + } + # Check sha1 + $storedSha1 = Wikimedia\base_convert( sha1_file( $targetTempFile ), 16, 36, 31 ); + if ( $storedSha1 == $sha1 ) { + return true; + } + $this->output( sprintf( "File from URL %s doesn\'t match the expected sha1. Expected: %s. Actual: %s\n", + $fileurl, $sha1, $storedSha1 ) ); + } else { + $this->output( sprintf( "Error when saving contents of URL %s: %s\n", + $fileurl, implode( '. 
', $status->getErrors() ) ) ); + } + return false; + } + + /** + * Callback: save a chunk of the result of a HTTP request to the temporary file + * Copied from UploadFromUrl + * + * @param mixed $req + * @param string $buffer + * @return int Number of bytes handled + */ + function saveTempFileChunk( $req, $buffer ) { + $nbytes = fwrite( $this->mTmpHandle, $buffer ); + + if ( $nbytes != strlen( $buffer ) ) { + // Well... that's not good! + $this->output( sprintf( "Short write %s/%s bytes, aborting.\n", + $nbytes, strlen( $buffer ) ), 1 ); + fclose( $this->mTmpHandle ); + $this->mTmpHandle = false; + } + + return $nbytes; + } + + /** + * Formats metadata to the original format stored by MediaWiki + * The api returns an array of objects {name: paramName, value: paramValue} + * but we want to store {paramName: paramValue} + * + * @param $metadata Array as retrieved from the api + * @returns array + */ + function processMetaData( $metadata ) { + $result = array(); + if ( !is_array( $metadata ) ) { + return $result; + } + foreach ( $metadata as $namevalue ) { + $name = $namevalue['name']; + $value = $namevalue['value']; + if ( is_array( $value ) ) { + $result[$name] = $this->processMetaData( $value ); + } else { + $result[$name] = $value; + } + } + return $result; + } + + /** + * Strips the namespace from the title, if namespace number is different than 0, + * and converts spaces to underscores. 
For use in database + * + * @param int $ns Namespace number + * @param string $title Title of the page with the namespace + */ + function sanitiseTitle( $ns, $title ) { + if ( $ns != 0 ) { + $title = preg_replace( '/^[^:]*?:/', '', $title ); + } + $title = str_replace( ' ', '_', $title ); + return $title; + } + } $maintClass = 'GrabFiles'; -- To view, visit https://gerrit.wikimedia.org/r/361207 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ie9ce0fafaae6a9f4ffe92123806b891c7bb0e760 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/tools/grabbers Gerrit-Branch: master Gerrit-Owner: Martineznovo <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
