Brian Wolff has uploaded a new change for review. https://gerrit.wikimedia.org/r/80403
Change subject: Extension to screen scrape metadata out of commons ...................................................................... Extension to screen scrape metadata out of commons Initial commit. I know the patch is rough in a couple places, and it needs caching. for use with https://gerrit.wikimedia.org/r/#/c/78926/ Change-Id: I5e6bc45f9751641e16426231dabcc8277b86fee0 --- A CommonsMetadata.i18n.php A CommonsMetadata.php A CommonsMetadata_body.php 3 files changed, 295 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CommonsMetadata refs/changes/03/80403/1 diff --git a/CommonsMetadata.i18n.php b/CommonsMetadata.i18n.php new file mode 100644 index 0000000..6f5a756 --- /dev/null +++ b/CommonsMetadata.i18n.php @@ -0,0 +1,20 @@ +<?php +/** + * Internationalisation file for extension CommonsMetadata. + * + * @file + * @ingroup Extensions + */ + +$messages= array(); + +$messages['en'] = array( + 'commonsmetadata-desc' => 'Extends the "extmetadata" prop of the image info API module to include information stored in image description pages that use the templates commonly used on Wikimedia Commons.', +); + +/** Message documentation (Message documentation) + * @author Bawolff + */ +$messages['qqq'] = array( + 'commonsmetadata-desc' => '{{desc}}', +); diff --git a/CommonsMetadata.php b/CommonsMetadata.php new file mode 100644 index 0000000..0570e3d --- /dev/null +++ b/CommonsMetadata.php @@ -0,0 +1,31 @@ +<?php +# Extends the extmetadata property of image info API module to include +# details from file description pages that use commons style templates. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# http://www.gnu.org/copyleft/gpl.html + +$wgExtensionCredits['other'][] = array( + 'path' => __FILE__, + 'name' => 'CommonsMetadata', + 'author' => 'Brian Wolff', + 'url' => '//www.mediawiki.org/wiki/Extension:CommonsMetadata', + 'descriptionmsg' => 'commonsmetadata-desc', +); +$wgAutoloadClasses['CommonsMeta'] = __DIR__ . '/CommonsMetadata_body.php'; +$wgExtensionMessagesFiles['CommonsMetadata'] = __DIR__ . '/CommonsMetadata.i18n.php'; + +$wgHooks['GetExtendedMetadata'][] = 'CommonsMeta::onGetExtendedMetadata'; + diff --git a/CommonsMetadata_body.php b/CommonsMetadata_body.php new file mode 100644 index 0000000..5cd1c7f --- /dev/null +++ b/CommonsMetadata_body.php @@ -0,0 +1,244 @@ +<?php +class CommonsMeta { + + private $xmlParser; + private $state = self::STATE_INITIAL; + private $text; + private $propName; + private $tdDepth = 0; + private $divDepth = 0; + private $finalProps = array(); + private $curExtractionLang = ''; + private $extractionLang; + private $langText = ''; + private $curLangText = ''; + private $fallbackLangs; + private $targetLang = false; // false for all, language code otherwise. 
+ private $langTextPriority = 2000; + + const STATE_INITIAL = 1; + const STATE_NEXTTD = 2; + const STATE_CAPTURE_TEXT = 3; + const STATE_CAPTURE_LANG = 4; + + public static function getMetadata( $doc, $lang = false ) { + $obj = new CommonsMeta; + $obj->setLanguage( $lang ); + return $obj->parsePage( $doc ); + } + + public static function onGetExtendedMetadata( $combinedMeta, $file, $context, $singleLang ) { + $lang = $context->getLanguage()->getCode(); + $query = array( 'action' => 'render', 'uselang' => $lang ); + $descriptionUrl = $file->getDescriptionUrl(); + $descriptionUrl = wfAppendQuery( $descriptionUrl, $query ); + $descriptionUrl = wfExpandUrl( $descriptionUrl, PROTO_CANONICAL ); + + $descriptionText = Http::get( $descriptionUrl ); + + if ( $singleLang ) { + $data = self::getMetadata( $descriptionText, $lang ); + } else { + $data = self::getMetadata( $descriptionText ); + } + + foreach( $data as $name => $value ) { + $combinedMeta[ $name ] = array( + 'value' => $value, + 'source' => 'commons-desc-page' + ); + } + + return true; + } + + /** + * Set the language to fetch + * @param String|boolean $lang Language code or false for all langs. + */ + public function setLanguage( $lang ) { + if ( !Language::isValidCode( $lang ) ) { + // Maybe do more strict isValidBuiltinCode? 
+ throw new MWException( 'Invalid language code specified' ); + } + $this->targetLang = $lang; + } + + function __construct() { + if ( !function_exists( 'xml_parser_create' ) ) { + throw new MWException( 'No XML parser support' ); + } + $this->xmlParser = xml_parser_create( 'UTF-8' ); + if ( !$this->xmlParser ) { + throw new MWException( 'Could not create parser' ); + } + xml_set_character_data_handler( $this->xmlParser, array( $this, 'char' ) ); + xml_set_element_handler( $this->xmlParser, array( $this, 'elmStart' ), array( $this, 'elmEnd' ) ); + } + + public function parsePage( $html ) { + xml_parse( $this->xmlParser, "<renderout>$html</renderout>" ); + return $this->finalProps; + } + + public function elmStart( $parser, $name, array $attribs ) { + switch ( $this->state ) { + case self::STATE_INITIAL: + if ( $name === 'TD' ) { + $this->elmStartInitial( $attribs ); + } + break; + case self::STATE_NEXTTD: + if ( $name === 'TD' ) { + if ( $this->tdDepth <= 0 ) { + $this->state = self::STATE_CAPTURE_TEXT; + $this->tdDepth = 1; + $this->text = ''; + $this->langText = ''; + $this->extractionLang = ''; + } else { + $this->tdDepth++; + } + } + break; + case self::STATE_CAPTURE_TEXT: + if ( $name === 'TD' ) { + $this->tdDepth++; + } + // FIXME inline language tags. 
+ if ( $this->targetLang && $name === 'DIV' && isset( $attribs['CLASS'] ) && isset( $attribs['LANG'] ) ) { + if ( preg_match( '/(?:^|\s)description(?:\s|$)/', $attribs['CLASS'] ) ) { + $this->state = self::STATE_CAPTURE_LANG; + $this->curExtractionLang = $attribs['LANG']; + $this->curLangText = ''; + $this->divDepth = 1; + } + } + $this->text .= Html::openElement( $name, $attribs ); + break; + case self::STATE_CAPTURE_LANG: + if ( $name === 'TD' ) { + $this->tdDepth++; + } elseif( $name === 'DIV' ) { + $this->divDepth++; + } + $this->curLangText .= Html::openElement( $name, $attribs ); + break; + } + } + + public function elmStartInitial( $attribs ) { + $nextTdIds = array( + 'fileinfotpl_desc' => 'ImageDescription', + 'fileinfotpl_date' => 'DateTimeOriginal', + 'fileinfotpl_aut' => 'Artist', + # Credit (iptc 2:110) describes the "provider of the image" + # which is different from "Source" (2:115) which is for + # the owner of the image. I think "Credit" fits much more + # closely to the commons notion of source than "Source" does. + 'fileinfotpl_src' => 'Credit', + 'fileinfotpl_art_title' => 'ObjectName', + 'fileinfotpl_book_title' => 'ObjectName', + ); + if ( isset( $attribs['ID'] ) ) { + if ( isset( $nextTdIds[ $attribs['ID'] ] ) ) { + $this->propName = $nextTdIds[ $attribs['ID'] ]; + $this->state = self::STATE_NEXTTD; + $this->tdDepth = 1; + } + } + } + public function elmEnd( $parser, $name ) { + switch ( $this->state ) { + case self::STATE_INITIAL: + break; + case self::STATE_NEXTTD: + if ( $name === 'TD' ) { + $this->tdDepth--; + } + break; + case self::STATE_CAPTURE_TEXT: + if ( $name === 'TD' ) { + $this->tdDepth--; + } + if ( $this->tdDepth <= 0 ) { + $this->state = self::STATE_INITIAL; + if ( $this->langText !== '' ) { + $this->finalProps[ $this->propName ] = Html::rawElement( + 'span', + // FIXME dir too? 
+ array( 'lang' => $this->extractionLang ), + $this->langText + ); + } else { + $this->finalProps[ $this->propName ] = $this->text; + } + $this->langText = ''; + $this->extractionLang = ''; + $this->text = ''; + $this->tdDepth = 0; + } else { + $this->text .= Html::closeElement( $name ); + } + break; + case self::STATE_CAPTURE_LANG: + if ( $name === 'TD' ) { + $this->tdDepth--; + } elseif( $name === 'DIV' ) { + $this->divDepth--; + } + + if ( $name === 'DIV' && $this->divDepth <= 0 ) { + // We are done with the lang section + $fallbacks = $this->getFallbacks(); + if ( isset( $fallbacks[ $this->curExtractionLang ] ) ) { + $priority = $fallbacks[ $this->curExtractionLang ]; + } else { + $priority = 1000; + } + if ( $priority < $this->langTextPriority ) { + // This is a more important extraction than the previous one + $this->langText = $this->curLangText; + $this->extractionLang = $this->curExtractionLang; + $this->langTextPriority = $priority; + } else { + // Throw away this translation. + $this->curLangText = ''; + $this->curExtractionLang = ''; + } + $this->state = self::STATE_CAPTURE_TEXT; + } else { + $this->curLangText .= Html::closeElement( $name ); + } + + break; + } + + + } + + public function char( $parser, $text ) { + // fixme - html escape? 
+ if ( $this->state === self::STATE_CAPTURE_TEXT ) { + $this->text .= $text; + } elseif ( $this->state === self::STATE_CAPTURE_LANG ) { + $this->curLangText .= $text; + } + } + + private function getFallbacks() { + if ( $this->fallbackLangs ) { + return $this->fallbackLangs; + } + + if ( !$this->targetLang ) { + return array(); + } + + $fallbacks = Language::getFallbacksFor( $this->targetLang ); + array_unshift( $fallbacks, $this->targetLang ); + + $this->fallbackLangs = array_flip( $fallbacks ); + return $this->fallbackLangs; + } +} -- To view, visit https://gerrit.wikimedia.org/r/80403 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I5e6bc45f9751641e16426231dabcc8277b86fee0 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CommonsMetadata Gerrit-Branch: master Gerrit-Owner: Brian Wolff <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
