jenkins-bot has submitted this change and it was merged.
Change subject: Factor page name normalization out of MediaWikiSite
......................................................................
Factor page name normalization out of MediaWikiSite
Into a new MediaWikiPageNameNormalizer.
The code has been copied over almost 1:1; I only
left the PHPUnit test special handling in MediaWikiSite.
Change-Id: I008cadd29a2aa1f21098339b895c35a100959b04
---
M autoload.php
A includes/site/MediaWikiPageNameNormalizer.php
M includes/site/MediaWikiSite.php
A tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php
4 files changed, 292 insertions(+), 136 deletions(-)
Approvals:
Legoktm: Looks good to me, approved
jenkins-bot: Verified
diff --git a/autoload.php b/autoload.php
index 46da116..6083ce2 100644
--- a/autoload.php
+++ b/autoload.php
@@ -777,6 +777,7 @@
'MediaWiki\\Logger\\Monolog\\WikiProcessor' => __DIR__ .
'/includes/debug/logger/monolog/WikiProcessor.php',
'MediaWiki\\Logger\\NullSpi' => __DIR__ .
'/includes/debug/logger/NullSpi.php',
'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
+ 'MediaWiki\\Site\\MediaWikiPageNameNormalizer' => __DIR__ .
'/includes/site/MediaWikiPageNameNormalizer.php',
'MediaWiki\\Tidy\\Html5Depurate' => __DIR__ .
'/includes/tidy/Html5Depurate.php',
'MediaWiki\\Tidy\\RaggettBase' => __DIR__ .
'/includes/tidy/RaggettBase.php',
'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ .
'/includes/tidy/RaggettExternal.php',
diff --git a/includes/site/MediaWikiPageNameNormalizer.php
b/includes/site/MediaWikiPageNameNormalizer.php
new file mode 100644
index 0000000..f358bd4
--- /dev/null
+++ b/includes/site/MediaWikiPageNameNormalizer.php
@@ -0,0 +1,196 @@
+<?php
+
+namespace MediaWiki\Site;
+
+use FormatJson;
+use Http;
+use UtfNormal\Validator;
+
+/**
+ * Service for normalizing a page name using a MediaWiki api.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.27
+ *
+ * @license GNU GPL v2+
+ * @author John Erling Blad < [email protected] >
+ * @author Daniel Kinzler
+ * @author Jeroen De Dauw < [email protected] >
+ * @author Marius Hoch
+ */
class MediaWikiPageNameNormalizer {

	/**
	 * Returns the normalized form of the given page title, using the
	 * normalization rules of the given site. If the given title is a redirect,
	 * the redirect will be resolved and the redirect target is returned.
	 *
	 * @note This actually makes an API request to the remote site, so beware
	 *   that this function is slow and depends on an external service.
	 *
	 * @see Site::normalizePageName
	 *
	 * @since 1.27
	 *
	 * @param string $pageName Page title to normalize.
	 * @param string $apiUrl Base URL of the remote wiki's api.php entry point.
	 *
	 * @return string|bool The normalized page title, or false if the remote
	 *   call failed or the title turned out to be missing or invalid.
	 * @throws \MWException If $pageName is not a string.
	 */
	public function normalizePageName( $pageName, $apiUrl ) {
		// Check if we have strings as arguments.
		if ( !is_string( $pageName ) ) {
			throw new \MWException( '$pageName must be a string' );
		}

		// Make sure the string is normalized into NFC (due to T42017)
		// but do nothing to the whitespaces, that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = Validator::cleanUp( $pageName );

		// Build the args for the specific call
		$args = array(
			'action' => 'query',
			'prop' => 'info',
			'redirects' => true,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
			// @todo options for maxlag and maxage
			// Note that maxlag will lead to a long delay before a reply is made,
			// but that maxage can avoid the extreme delay. On the other hand
			// maxage could be nice to use anyhow as it stops unnecessary requests.
			// Also consider smaxage if maxage is used.
		);

		$url = wfAppendQuery( $apiUrl, $args );

		// Go on call the external site
		// @todo we need a good way to specify a timeout here.
		$ret = Http::get( $url, array(), __METHOD__ );

		if ( $ret === false ) {
			wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
			return false;
		}

		$data = FormatJson::decode( $ret, true );

		if ( !is_array( $data ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
			return false;
		}

		$page = static::extractPageRecord( $data, $pageName );

		if ( isset( $page['missing'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
				. $ret );
			return false;
		}

		if ( isset( $page['invalid'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
				. $ret );
			return false;
		}

		if ( !isset( $page['title'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
			return false;
		}

		return $page['title'];
	}

	/**
	 * Get normalization record for a given page title from an API response.
	 *
	 * @param array $externalData A reply from the API on a external server.
	 * @param string $pageTitle Identifies the page at the external site, needing normalization.
	 *
	 * @return array|bool A 'page' structure representing the page identified by $pageTitle,
	 *   or false when no unambiguous match was found.
	 */
	private static function extractPageRecord( $externalData, $pageTitle ) {
		// If there is a special case with only one returned page
		// we can cheat, and only return
		// the single page in the "pages" substructure.
		if ( isset( $externalData['query']['pages'] ) ) {
			$pages = array_values( $externalData['query']['pages'] );
			if ( count( $pages ) === 1 ) {
				return $pages[0];
			}
		}
		// This is only used during internal testing, as it is assumed
		// a more optimal (and lossfree) storage.
		// Make initial checks and return if prerequisites are not met.
		if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
			return false;
		}
		// Loop over the three different named structures, that otherwise are similar
		$structs = array(
			'normalized' => 'from',
			'converted' => 'from',
			'redirects' => 'from',
			'pages' => 'title'
		);
		foreach ( $structs as $listId => $fieldId ) {
			// Check if the substructure exist at all.
			if ( !isset( $externalData['query'][$listId] ) ) {
				continue;
			}
			// Filter the substructure down to what we actually are using.
			// Note: array_filter() preserves keys, so the single remaining
			// entry is not necessarily at index 0.
			$collectedHits = array_filter(
				array_values( $externalData['query'][$listId] ),
				function ( $a ) use ( $fieldId, $pageTitle ) {
					return $a[$fieldId] === $pageTitle;
				}
			);
			// If still looping over normalization, conversion or redirects,
			// then we need to keep the new page title for later rounds.
			if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
				switch ( count( $collectedHits ) ) {
					case 0:
						break;
					case 1:
						// Use array_shift (not index 0): the surviving key
						// after array_filter() may be non-zero.
						$hit = array_shift( $collectedHits );
						$pageTitle = $hit['to'];
						break;
					default:
						return false;
				}
			} elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
				// If on the pages structure we should prepare for returning.
				switch ( count( $collectedHits ) ) {
					case 0:
						return false;
					case 1:
						return array_shift( $collectedHits );
					default:
						return false;
				}
			}
		}
		// should never be here
		return false;
	}

}
diff --git a/includes/site/MediaWikiSite.php b/includes/site/MediaWikiSite.php
index 9fec1f4..0f7e5d7 100644
--- a/includes/site/MediaWikiSite.php
+++ b/includes/site/MediaWikiSite.php
@@ -1,4 +1,7 @@
<?php
+
+use MediaWiki\Site\MediaWikiPageNameNormalizer;
+
/**
* Class representing a MediaWiki site.
*
@@ -96,13 +99,6 @@
* @throws MWException
*/
public function normalizePageName( $pageName ) {
-
- // Check if we have strings as arguments.
- if ( !is_string( $pageName ) ) {
- throw new MWException( '$pageName must be a string' );
- }
-
- // Go on call the external site
if ( defined( 'MW_PHPUNIT_TEST' ) ) {
// If the code is under test, don't call out to other
sites, just
// normalize locally.
@@ -112,139 +108,17 @@
$t = Title::newFromText( $pageName );
return $t->getPrefixedText();
} else {
+ static $mediaWikiPageNameNormalizer = null;
- // Make sure the string is normalized into NFC (due to
T42017)
- // but do nothing to the whitespaces, that should work
appropriately.
- // @see https://phabricator.wikimedia.org/T42017
- $pageName = UtfNormal\Validator::cleanUp( $pageName );
+ if ( $mediaWikiPageNameNormalizer === null ) {
+ $mediaWikiPageNameNormalizer = new
MediaWikiPageNameNormalizer();
+ }
- // Build the args for the specific call
- $args = array(
- 'action' => 'query',
- 'prop' => 'info',
- 'redirects' => true,
- 'converttitles' => true,
- 'format' => 'json',
- 'titles' => $pageName,
- // @todo options for maxlag and maxage
- // Note that maxlag will lead to a long delay
before a reply is made,
- // but that maxage can avoid the extreme delay.
On the other hand
- // maxage could be nice to use anyhow as it
stops unnecessary requests.
- // Also consider smaxage if maxage is used.
+ return $mediaWikiPageNameNormalizer->normalizePageName(
+ $pageName,
+ $this->getFileUrl( 'api.php' )
);
-
- $url = wfAppendQuery( $this->getFileUrl( 'api.php' ),
$args );
-
- // Go on call the external site
- // @todo we need a good way to specify a timeout here.
- $ret = Http::get( $url, array(), __METHOD__ );
}
-
- if ( $ret === false ) {
- wfDebugLog( "MediaWikiSite", "call to external site
failed: $url" );
- return false;
- }
-
- $data = FormatJson::decode( $ret, true );
-
- if ( !is_array( $data ) ) {
- wfDebugLog( "MediaWikiSite", "call to <$url> returned
bad json: " . $ret );
- return false;
- }
-
- $page = static::extractPageRecord( $data, $pageName );
-
- if ( isset( $page['missing'] ) ) {
- wfDebugLog( "MediaWikiSite", "call to <$url> returned a
marker for a missing page title! "
- . $ret );
- return false;
- }
-
- if ( isset( $page['invalid'] ) ) {
- wfDebugLog( "MediaWikiSite", "call to <$url> returned a
marker for an invalid page title! "
- . $ret );
- return false;
- }
-
- if ( !isset( $page['title'] ) ) {
- wfDebugLog( "MediaWikiSite", "call to <$url> did not
return a page title! " . $ret );
- return false;
- }
-
- return $page['title'];
- }
-
- /**
- * Get normalization record for a given page title from an API response.
- *
- * @since 1.21
- *
- * @param array $externalData A reply from the API on a external server.
- * @param string $pageTitle Identifies the page at the external site,
needing normalization.
- *
- * @return array|bool A 'page' structure representing the page
identified by $pageTitle.
- */
- private static function extractPageRecord( $externalData, $pageTitle ) {
- // If there is a special case with only one returned page
- // we can cheat, and only return
- // the single page in the "pages" substructure.
- if ( isset( $externalData['query']['pages'] ) ) {
- $pages = array_values( $externalData['query']['pages']
);
- if ( count( $pages ) === 1 ) {
- return $pages[0];
- }
- }
- // This is only used during internal testing, as it is assumed
- // a more optimal (and lossfree) storage.
- // Make initial checks and return if prerequisites are not meet.
- if ( !is_array( $externalData ) || !isset(
$externalData['query'] ) ) {
- return false;
- }
- // Loop over the tree different named structures, that
otherwise are similar
- $structs = array(
- 'normalized' => 'from',
- 'converted' => 'from',
- 'redirects' => 'from',
- 'pages' => 'title'
- );
- foreach ( $structs as $listId => $fieldId ) {
- // Check if the substructure exist at all.
- if ( !isset( $externalData['query'][$listId] ) ) {
- continue;
- }
- // Filter the substructure down to what we actually are
using.
- $collectedHits = array_filter(
- array_values( $externalData['query'][$listId] ),
- function ( $a ) use ( $fieldId, $pageTitle ) {
- return $a[$fieldId] === $pageTitle;
- }
- );
- // If still looping over normalization, conversion or
redirects,
- // then we need to keep the new page title for later
rounds.
- if ( $fieldId === 'from' && is_array( $collectedHits )
) {
- switch ( count( $collectedHits ) ) {
- case 0:
- break;
- case 1:
- $pageTitle =
$collectedHits[0]['to'];
- break;
- default:
- return false;
- }
- } elseif ( $fieldId === 'title' && is_array(
$collectedHits ) ) {
- // If on the pages structure we should prepare
for returning.
- switch ( count( $collectedHits ) ) {
- case 0:
- return false;
- case 1:
- return array_shift(
$collectedHits );
- default:
- return false;
- }
- }
- }
- // should never be here
- return false;
}
/**
diff --git a/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php
b/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php
new file mode 100644
index 0000000..163c52d
--- /dev/null
+++ b/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php
@@ -0,0 +1,85 @@
+<?php
+
+use MediaWiki\Site\MediaWikiPageNameNormalizer;
+
+/**
+ * @covers MediaWiki\Site\MediaWikiPageNameNormalizer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.27
+ *
+ * @group Site
+ * @group medium
+ *
+ * @author Marius Hoch
+ */
class MediaWikiPageNameNormalizerTest extends PHPUnit_Framework_TestCase {

	protected function setUp() {
		parent::setUp();

		// Probe result is cached across test methods within this process.
		static $connectivity = null;

		if ( $connectivity === null ) {
			// Check whether we have (reasonably fast) connectivity to Wikidata.
			$res = Http::get(
				'https://www.wikidata.org/w/api.php?action=query&meta=siteinfo&format=json',
				array( 'timeout' => 3 ),
				__METHOD__
			);

			$connectivity = $res !== false
				&& strpos( $res, '"sitename":"Wikidata"' ) !== false;
		}

		if ( !$connectivity ) {
			$this->markTestSkipped( 'MediaWikiPageNameNormalizerTest needs internet connectivity.' );
		}
	}

	/**
	 * @dataProvider normalizePageTitleProvider
	 */
	public function testNormalizePageTitle( $expected, $pageName ) {
		$normalizer = new MediaWikiPageNameNormalizer();

		$this->assertSame(
			$expected,
			$normalizer->normalizePageName( $pageName, 'https://www.wikidata.org/w/api.php' )
		);
	}

	public function normalizePageTitleProvider() {
		// Note: This makes (very conservative) assumptions about pages
		// on Wikidata existing or not.
		return array(
			'universe (Q1)' => array( 'Q1', 'Q1' ),
			'Q404 redirects to Q395' => array( 'Q395', 'Q404' ),
			'there is no Q0' => array( false, 'Q0' ),
		);
	}

}
--
To view, visit https://gerrit.wikimedia.org/r/260255
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I008cadd29a2aa1f21098339b895c35a100959b04
Gerrit-PatchSet: 7
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Hoo man <[email protected]>
Gerrit-Reviewer: Addshore <[email protected]>
Gerrit-Reviewer: Aude <[email protected]>
Gerrit-Reviewer: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Hoo man <[email protected]>
Gerrit-Reviewer: Legoktm <[email protected]>
Gerrit-Reviewer: MaxSem <[email protected]>
Gerrit-Reviewer: Reedy <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits