Brian Wolff has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/80403


Change subject: Extension to screen scrape metadata out of commons
......................................................................

Extension to screen scrape metadata out of commons

Initial commit. I know the patch is rough in a couple
places, and it needs caching.

for use with https://gerrit.wikimedia.org/r/#/c/78926/

Change-Id: I5e6bc45f9751641e16426231dabcc8277b86fee0
---
A CommonsMetadata.i18n.php
A CommonsMetadata.php
A CommonsMetadata_body.php
3 files changed, 295 insertions(+), 0 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CommonsMetadata 
refs/changes/03/80403/1

diff --git a/CommonsMetadata.i18n.php b/CommonsMetadata.i18n.php
new file mode 100644
index 0000000..6f5a756
--- /dev/null
+++ b/CommonsMetadata.i18n.php
@@ -0,0 +1,20 @@
+<?php
+/**
+ * Internationalisation file for extension CommonsMetadata.
+ *
+ * @file
+ * @ingroup Extensions
+ */
+
+$messages= array();
+
+$messages['en'] = array(
+       'commonsmetadata-desc' => 'Extends the "extmetadata" prop of the image 
info API module to include information stored in image description pages that 
use the templates commonly used on Wikimedia Commons.',
+);
+
+/** Message documentation (Message documentation)
+ * @author Bawolff
+ */
+$messages['qqq'] = array(
+       'commonsmetadata-desc' => '{{desc}}',
+);
diff --git a/CommonsMetadata.php b/CommonsMetadata.php
new file mode 100644
index 0000000..0570e3d
--- /dev/null
+++ b/CommonsMetadata.php
@@ -0,0 +1,31 @@
+<?php
+# Extends the extmetadata property of image info API module to include
+# details from file description pages that use commons style templates.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+$wgExtensionCredits['other'][] = array(
+       'path' => __FILE__,
+       'name' => 'CommonsMetadata',
+       'author' => 'Brian Wolff',
+       'url' => '//www.mediawiki.org/wiki/Extension:CommonsMetadata',
+       'descriptionmsg' => 'commonsmetadata-desc',
+);
+$wgAutoloadClasses['CommonsMeta'] = __DIR__ . '/CommonsMetadata_body.php';
+$wgExtensionMessagesFiles['CommonsMetadata'] =  __DIR__ . 
'/CommonsMetadata.i18n.php';
+
+$wgHooks['GetExtendedMetadata'][] = 'CommonsMeta::onGetExtendedMetadata';
+
diff --git a/CommonsMetadata_body.php b/CommonsMetadata_body.php
new file mode 100644
index 0000000..5cd1c7f
--- /dev/null
+++ b/CommonsMetadata_body.php
@@ -0,0 +1,349 @@
+<?php
+/**
+ * Extracts metadata from the rendered HTML of a file description page
+ * that uses Wikimedia Commons style information templates.
+ *
+ * The HTML is fed through PHP's event based XML parser. A small state
+ * machine looks for table cells with well-known ids, captures the
+ * contents of the following table cell, and, when a target language is
+ * set, narrows multilingual descriptions down to the best match.
+ */
+class CommonsMeta {
+
+       const STATE_INITIAL = 1;
+       const STATE_NEXTTD = 2;
+       const STATE_CAPTURE_TEXT = 3;
+       const STATE_CAPTURE_LANG = 4;
+
+       /** Priority meaning "no translation has been picked yet". */
+       const LANG_PRIORITY_NONE = 2000;
+       /** Priority of a translation that is not in the fallback chain. */
+       const LANG_PRIORITY_OTHER = 1000;
+
+       private $xmlParser;
+       private $state = self::STATE_INITIAL;
+       private $text;
+       private $propName;
+       private $tdDepth = 0;
+       private $divDepth = 0;
+       private $finalProps = array();
+       private $curExtractionLang = '';
+       private $extractionLang;
+       private $langText = '';
+       private $curLangText = '';
+       private $fallbackLangs;
+       private $targetLang = false; // false for all, language code otherwise.
+       private $langTextPriority = self::LANG_PRIORITY_NONE;
+
+       /**
+        * Convenience wrapper: parse a rendered description page in one call.
+        *
+        * @param String $doc HTML of the description page (action=render output)
+        * @param String|boolean $lang Language code, or false for all languages
+        * @return Array Map of metadata field name to HTML value
+        * @throws MWException If the parser cannot be created or $lang is invalid
+        */
+       public static function getMetadata( $doc, $lang = false ) {
+               $obj = new self;
+               $obj->setLanguage( $lang );
+               return $obj->parsePage( $doc );
+       }
+
+       /**
+        * Handler for the GetExtendedMetadata hook.
+        *
+        * Fetches the rendered description page of $file over HTTP and merges
+        * the fields scraped from it into $combinedMeta.
+        *
+        * @param Array &$combinedMeta Metadata so far; taken by reference, since
+        *   a by-value parameter would silently discard everything added here
+        * @param File $file File whose description page is scraped
+        * @param IContextSource $context Context supplying the user language
+        * @param boolean $singleLang Whether to return only the best language
+        * @return boolean Always true, so that other hook handlers still run
+        */
+       public static function onGetExtendedMetadata( &$combinedMeta, $file, $context, $singleLang ) {
+               $lang = $context->getLanguage()->getCode();
+               $query = array( 'action' => 'render', 'uselang' => $lang );
+               $descriptionUrl = $file->getDescriptionUrl();
+               if ( !$descriptionUrl ) {
+                       // No description page to fetch for this file.
+                       return true;
+               }
+               $descriptionUrl = wfAppendQuery( $descriptionUrl, $query );
+               $descriptionUrl = wfExpandUrl( $descriptionUrl, PROTO_CANONICAL );
+
+               $descriptionText = Http::get( $descriptionUrl );
+               if ( $descriptionText === false ) {
+                       // The request failed, so there is nothing to scrape.
+                       return true;
+               }
+
+               $data = self::getMetadata( $descriptionText, $singleLang ? $lang : false );
+
+               foreach ( $data as $name => $value ) {
+                       $combinedMeta[ $name ] = array(
+                               'value' => $value,
+                               'source' => 'commons-desc-page'
+                       );
+               }
+
+               return true;
+       }
+
+       /**
+        * Set the language to fetch
+        *
+        * @param String|boolean $lang Language code or false for all langs.
+        * @throws MWException If $lang is neither false nor a valid code
+        */
+       public function setLanguage( $lang ) {
+               // false is the documented "all languages" value and must not be
+               // run through the language code check.
+               if ( $lang !== false && !Language::isValidCode( $lang ) ) {
+                       // Maybe do more strict isValidBuiltinCode?
+                       throw new MWException( 'Invalid language code specified' );
+               }
+               $this->targetLang = $lang;
+               // Any cached fallback chain belongs to the previous language.
+               $this->fallbackLangs = null;
+       }
+
+       /**
+        * Create the XML parser and hook up this object's handlers.
+        *
+        * @throws MWException If XML parser support is missing or the parser
+        *   could not be created
+        */
+       public function __construct() {
+               if ( !function_exists( 'xml_parser_create' ) ) {
+                       throw new MWException( 'No XML parser support' );
+               }
+               $this->xmlParser = xml_parser_create( 'UTF-8' );
+               if ( !$this->xmlParser ) {
+                       throw new MWException( 'Could not create parser' );
+               }
+               xml_set_character_data_handler( $this->xmlParser, array( $this, 'char' ) );
+               xml_set_element_handler( $this->xmlParser, array( $this, 'elmStart' ), array( $this, 'elmEnd' ) );
+       }
+
+       /**
+        * Parse a rendered description page.
+        *
+        * Parsing is best effort: if the markup is not well formed, whatever
+        * was collected before the parser gave up is still returned.
+        *
+        * @param String $html HTML of the description page
+        * @return Array Map of metadata field name to HTML value
+        */
+       public function parsePage( $html ) {
+               // The document is passed in one piece, so mark it as the final chunk.
+               xml_parse( $this->xmlParser, "<renderout>$html</renderout>", true );
+               return $this->finalProps;
+       }
+
+       /**
+        * XML parser callback for an opening tag.
+        *
+        * Element and attribute names are compared in upper case, which is how
+        * the parser reports them with its default case folding.
+        *
+        * @param resource $parser The XML parser (unused)
+        * @param String $name Element name
+        * @param Array $attribs Attributes of the element
+        */
+       public function elmStart( $parser, $name, array $attribs ) {
+               switch ( $this->state ) {
+                       case self::STATE_INITIAL:
+                               if ( $name === 'TD' ) {
+                                       $this->elmStartInitial( $attribs );
+                               }
+                               break;
+                       case self::STATE_NEXTTD:
+                               if ( $name === 'TD' ) {
+                                       if ( $this->tdDepth <= 0 ) {
+                                               $this->state = self::STATE_CAPTURE_TEXT;
+                                               $this->tdDepth = 1;
+                                               $this->text = '';
+                                               $this->langText = '';
+                                               $this->extractionLang = '';
+                                               // Each field picks its own best translation; a priority
+                                               // left over from the previous field would block this one.
+                                               $this->langTextPriority = self::LANG_PRIORITY_NONE;
+                                       } else {
+                                               $this->tdDepth++;
+                                       }
+                               }
+                               break;
+                       case self::STATE_CAPTURE_TEXT:
+                               if ( $name === 'TD' ) {
+                                       $this->tdDepth++;
+                               }
+                               // FIXME inline language tags.
+                               if ( $this->targetLang && $name === 'DIV' && isset( $attribs['CLASS'] ) && isset( $attribs['LANG'] ) ) {
+                                       if ( preg_match( '/(?:^|\s)description(?:\s|$)/', $attribs['CLASS'] ) ) {
+                                               $this->state = self::STATE_CAPTURE_LANG;
+                                               $this->curExtractionLang = $attribs['LANG'];
+                                               $this->curLangText = '';
+                                               $this->divDepth = 1;
+                                       }
+                               }
+                               $this->text .= Html::openElement( $name, $attribs );
+                               break;
+                       case self::STATE_CAPTURE_LANG:
+                               if ( $name === 'TD' ) {
+                                       $this->tdDepth++;
+                               } elseif ( $name === 'DIV' ) {
+                                       $this->divDepth++;
+                               }
+                               $this->curLangText .= Html::openElement( $name, $attribs );
+                               break;
+               }
+       }
+
+       /**
+        * Handle an opening <td> while looking for a known field label.
+        *
+        * If the cell's id is one of the known template ids, remember which
+        * metadata field it maps to and start waiting for the next cell.
+        *
+        * @param Array $attribs Attributes of the <td> element
+        */
+       public function elmStartInitial( $attribs ) {
+               $nextTdIds = array(
+                       'fileinfotpl_desc' => 'ImageDescription',
+                       'fileinfotpl_date' => 'DateTimeOriginal',
+                       'fileinfotpl_aut' => 'Artist',
+                       # Credit (iptc 2:110) describes the "provider of the image"
+                       # which is different from "Source" (2:115) which is for
+                       # the owner of the image. I think "Credit" fits much more
+                       # closely to the commons notion of source than "Source" does.
+                       'fileinfotpl_src' => 'Credit',
+                       'fileinfotpl_art_title' => 'ObjectName',
+                       'fileinfotpl_book_title' => 'ObjectName',
+               );
+               if ( isset( $attribs['ID'] ) && isset( $nextTdIds[ $attribs['ID'] ] ) ) {
+                       $this->propName = $nextTdIds[ $attribs['ID'] ];
+                       $this->state = self::STATE_NEXTTD;
+                       $this->tdDepth = 1;
+               }
+       }
+
+       /**
+        * XML parser callback for a closing tag.
+        *
+        * Closing the captured cell stores the collected value under the
+        * current field name; closing a language block decides whether that
+        * translation replaces the one picked so far.
+        *
+        * @param resource $parser The XML parser (unused)
+        * @param String $name Element name
+        */
+       public function elmEnd( $parser, $name ) {
+               switch ( $this->state ) {
+                       case self::STATE_INITIAL:
+                               break;
+                       case self::STATE_NEXTTD:
+                               if ( $name === 'TD' ) {
+                                       $this->tdDepth--;
+                               }
+                               break;
+                       case self::STATE_CAPTURE_TEXT:
+                               if ( $name === 'TD' ) {
+                                       $this->tdDepth--;
+                               }
+                               if ( $this->tdDepth <= 0 ) {
+                                       $this->state = self::STATE_INITIAL;
+                                       if ( $this->langText !== '' ) {
+                                               $this->finalProps[ $this->propName ] = Html::rawElement(
+                                                       'span',
+                                                       // FIXME dir too?
+                                                       array( 'lang' => $this->extractionLang ),
+                                                       $this->langText
+                                               );
+                                       } else {
+                                               $this->finalProps[ $this->propName ] = $this->text;
+                                       }
+                                       $this->langText = '';
+                                       $this->extractionLang = '';
+                                       $this->text = '';
+                                       $this->tdDepth = 0;
+                               } else {
+                                       $this->text .= Html::closeElement( $name );
+                               }
+                               break;
+                       case self::STATE_CAPTURE_LANG:
+                               if ( $name === 'TD' ) {
+                                       $this->tdDepth--;
+                               } elseif ( $name === 'DIV' ) {
+                                       $this->divDepth--;
+                               }
+
+                               if ( $name === 'DIV' && $this->divDepth <= 0 ) {
+                                       // We are done with the lang section
+                                       $fallbacks = $this->getFallbacks();
+                                       if ( isset( $fallbacks[ $this->curExtractionLang ] ) ) {
+                                               $priority = $fallbacks[ $this->curExtractionLang ];
+                                       } else {
+                                               $priority = self::LANG_PRIORITY_OTHER;
+                                       }
+                                       if ( $priority < $this->langTextPriority ) {
+                                               // This is a more important extraction than the previous one
+                                               $this->langText = $this->curLangText;
+                                               $this->extractionLang = $this->curExtractionLang;
+                                               $this->langTextPriority = $priority;
+                                       } else {
+                                               // Throw away this translation.
+                                               $this->curLangText = '';
+                                               $this->curExtractionLang = '';
+                                       }
+                                       $this->state = self::STATE_CAPTURE_TEXT;
+                               } else {
+                                       $this->curLangText .= Html::closeElement( $name );
+                               }
+
+                               break;
+               }
+       }
+
+       /**
+        * XML parser callback for character data.
+        *
+        * The parser hands over text with entities already decoded, while the
+        * buffers hold HTML, so the text has to be escaped again before it is
+        * appended.
+        *
+        * @param resource $parser The XML parser (unused)
+        * @param String $text Decoded character data
+        */
+       public function char( $parser, $text ) {
+               if ( $this->state === self::STATE_CAPTURE_TEXT ) {
+                       $this->text .= htmlspecialchars( $text );
+               } elseif ( $this->state === self::STATE_CAPTURE_LANG ) {
+                       $this->curLangText .= htmlspecialchars( $text );
+               }
+       }
+
+       /**
+        * Get the priority of each language acceptable for the target language.
+        *
+        * @return Array Map of language code to priority, lower is better. The
+        *   target language is 0, followed by its fallbacks in order. Empty if
+        *   no target language is set.
+        */
+       private function getFallbacks() {
+               if ( $this->fallbackLangs ) {
+                       return $this->fallbackLangs;
+               }
+
+               if ( !$this->targetLang ) {
+                       return array();
+               }
+
+               $fallbacks = Language::getFallbacksFor( $this->targetLang );
+               array_unshift( $fallbacks, $this->targetLang );
+
+               $this->fallbackLangs = array_flip( $fallbacks );
+               return $this->fallbackLangs;
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/80403
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5e6bc45f9751641e16426231dabcc8277b86fee0
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CommonsMetadata
Gerrit-Branch: master
Gerrit-Owner: Brian Wolff <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to