Hoo man has uploaded a new change for review.
https://gerrit.wikimedia.org/r/260255
Change subject: Factor page name normalization out of MediaWikiSite
......................................................................
Factor page name normalization out of MediaWikiSite
Into a new MediaWikiPageNameNormalizer.
The code has been copied over almost 1:1, I only
left the phpunit test special handling in MediaWiki site.
Change-Id: I008cadd29a2aa1f21098339b895c35a100959b04
---
M autoload.php
A includes/site/MediaWikiPageNameNormalizer.php
M includes/site/MediaWikiSite.php
A tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php
4 files changed, 318 insertions(+), 140 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core
refs/changes/55/260255/1
diff --git a/autoload.php b/autoload.php
index f546201..7628da1 100644
--- a/autoload.php
+++ b/autoload.php
@@ -329,6 +329,7 @@
'DeletedContributionsPage' => __DIR__ .
'/includes/specials/SpecialDeletedContributions.php',
'DependencyWrapper' => __DIR__ . '/includes/cache/CacheDependency.php',
'DeprecatedGlobal' => __DIR__ . '/includes/DeprecatedGlobal.php',
+ 'DeprecatedInterfaceFinder' => __DIR__ .
'/maintenance/findDeprecated.php',
'DerivativeContext' => __DIR__ .
'/includes/context/DerivativeContext.php',
'DerivativeRequest' => __DIR__ . '/includes/WebRequest.php',
'DerivativeResourceLoaderContext' => __DIR__ .
'/includes/resourceloader/DerivativeResourceLoaderContext.php',
@@ -423,6 +424,7 @@
'FewestrevisionsPage' => __DIR__ .
'/includes/specials/SpecialFewestrevisions.php',
'Field' => __DIR__ . '/includes/db/DatabaseUtility.php',
'File' => __DIR__ . '/includes/filerepo/file/File.php',
+ 'FileAwareNodeVisitor' => __DIR__ . '/maintenance/findDeprecated.php',
'FileBackend' => __DIR__ . '/includes/filebackend/FileBackend.php',
'FileBackendDBRepoWrapper' => __DIR__ .
'/includes/filerepo/FileBackendDBRepoWrapper.php',
'FileBackendError' => __DIR__ . '/includes/filebackend/FileBackend.php',
@@ -445,6 +447,7 @@
'FileOpBatch' => __DIR__ . '/includes/filebackend/FileOpBatch.php',
'FileRepo' => __DIR__ . '/includes/filerepo/FileRepo.php',
'FileRepoStatus' => __DIR__ . '/includes/filerepo/FileRepoStatus.php',
+ 'FindDeprecated' => __DIR__ . '/maintenance/findDeprecated.php',
'FindHooks' => __DIR__ . '/maintenance/findHooks.php',
'FindMissingFiles' => __DIR__ . '/maintenance/findMissingFiles.php',
'FindOrphanedFiles' => __DIR__ . '/maintenance/findOrphanedFiles.php',
@@ -773,6 +776,7 @@
'MediaWiki\\Logger\\Monolog\\WikiProcessor' => __DIR__ .
'/includes/debug/logger/monolog/WikiProcessor.php',
'MediaWiki\\Logger\\NullSpi' => __DIR__ .
'/includes/debug/logger/NullSpi.php',
'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
+ 'MediaWiki\\Site\\MediaWikiPageNameNormalizer' => __DIR__ .
'/includes/site/MediaWikiPageNameNormalizer.php',
'MediaWiki\\Tidy\\Html5Depurate' => __DIR__ .
'/includes/tidy/Html5Depurate.php',
'MediaWiki\\Tidy\\RaggettBase' => __DIR__ .
'/includes/tidy/RaggettBase.php',
'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ .
'/includes/tidy/RaggettExternal.php',
diff --git a/includes/site/MediaWikiPageNameNormalizer.php
b/includes/site/MediaWikiPageNameNormalizer.php
new file mode 100644
index 0000000..aee0366
--- /dev/null
+++ b/includes/site/MediaWikiPageNameNormalizer.php
@@ -0,0 +1,217 @@
+<?php
+
+namespace MediaWiki\Site;
+
+use FormatJson;
+use Http;
+use UtfNormal\Validator;
+
+/**
+ * Service for normalizing page names using a MediaWiki API.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Site
+ * @license GNU GPL v2+
+ * @author John Erling Blad < [email protected] >
+ * @author Daniel Kinzler
+ * @author Jeroen De Dauw < [email protected] >
+ * @author Marius Hoch
+ */
+
+/**
+ * Service for normalizing a page name using a MediaWiki api.
+ *
+ * @since 1.27
+ *
+ * @ingroup Site
+ */
+class MediaWikiPageNameNormalizer {
+
+ /**
+ * @var string
+ */
+ private $apiUrl;
+
+ /**
+ * @since 1.27
+ *
+ * @param string $apiUrl
+ */
+ public function __construct( $apiUrl ) {
+ $this->apiUrl = $apiUrl;
+ }
+
+ /**
+ * Returns the normalized form of the given page title, using the
+ * normalization rules of the given site. If the given title is a
redirect,
+ the redirect will be resolved and the redirect target is returned.
+ *
+ * @note This actually makes an API request to the remote site, so
beware
+ * that this function is slow and depends on an external service.
+ *
+ * @see Site::normalizePageName
+ *
+ * @since 1.27
+ *
+ * @param string $pageName
+ *
+ * @return string
+ * @throws MWException
+ */
+ public function normalizePageName( $pageName ) {
+
+ // Check if we have strings as arguments.
+ if ( !is_string( $pageName ) ) {
+ throw new MWException( '$pageName must be a string' );
+ }
+
+ // Go on call the external site
+
+ // Make sure the string is normalized into NFC (due to T42017)
+ // but do nothing to the whitespaces, that should work
appropriately.
+ // @see https://phabricator.wikimedia.org/T42017
+ $pageName = Validator::cleanUp( $pageName );
+
+ // Build the args for the specific call
+ $args = array(
+ 'action' => 'query',
+ 'prop' => 'info',
+ 'redirects' => true,
+ 'converttitles' => true,
+ 'format' => 'json',
+ 'titles' => $pageName,
+ // @todo options for maxlag and maxage
+ // Note that maxlag will lead to a long delay before a
reply is made,
+ // but that maxage can avoid the extreme delay. On the
other hand
+ // maxage could be nice to use anyhow as it stops
unnecessary requests.
+ // Also consider smaxage if maxage is used.
+ );
+
+ $url = wfAppendQuery( $this->apiUrl, $args );
+
+ // Go on call the external site
+ // @todo we need a good way to specify a timeout here.
+ $ret = Http::get( $url, array(), __METHOD__ );
+
+ if ( $ret === false ) {
+ wfDebugLog( "MediaWikiSite", "call to external site
failed: $url" );
+ return false;
+ }
+
+ $data = FormatJson::decode( $ret, true );
+
+ if ( !is_array( $data ) ) {
+ wfDebugLog( "MediaWikiSite", "call to <$url> returned
bad json: " . $ret );
+ return false;
+ }
+
+ $page = static::extractPageRecord( $data, $pageName );
+
+ if ( isset( $page['missing'] ) ) {
+ wfDebugLog( "MediaWikiSite", "call to <$url> returned a
marker for a missing page title! "
+ . $ret );
+ return false;
+ }
+
+ if ( isset( $page['invalid'] ) ) {
+ wfDebugLog( "MediaWikiSite", "call to <$url> returned a
marker for an invalid page title! "
+ . $ret );
+ return false;
+ }
+
+ if ( !isset( $page['title'] ) ) {
+ wfDebugLog( "MediaWikiSite", "call to <$url> did not
return a page title! " . $ret );
+ return false;
+ }
+
+ return $page['title'];
+ }
+
+ /**
+ * Get normalization record for a given page title from an API response.
+ *
+ * @param array $externalData A reply from the API on a external server.
+ * @param string $pageTitle Identifies the page at the external site,
needing normalization.
+ *
+ * @return array|bool A 'page' structure representing the page
identified by $pageTitle.
+ */
+ private static function extractPageRecord( $externalData, $pageTitle ) {
+ // If there is a special case with only one returned page
+ // we can cheat, and only return
+ // the single page in the "pages" substructure.
+ if ( isset( $externalData['query']['pages'] ) ) {
+ $pages = array_values( $externalData['query']['pages']
);
+ if ( count( $pages ) === 1 ) {
+ return $pages[0];
+ }
+ }
+ // This is only used during internal testing, as it is assumed
+ // a more optimal (and lossfree) storage.
+ // Make initial checks and return if prerequisites are not met.
+ if ( !is_array( $externalData ) || !isset(
$externalData['query'] ) ) {
+ return false;
+ }
+ // Loop over the three different named structures, which
otherwise are similar
+ $structs = array(
+ 'normalized' => 'from',
+ 'converted' => 'from',
+ 'redirects' => 'from',
+ 'pages' => 'title'
+ );
+ foreach ( $structs as $listId => $fieldId ) {
+ // Check if the substructure exist at all.
+ if ( !isset( $externalData['query'][$listId] ) ) {
+ continue;
+ }
+ // Filter the substructure down to what we actually are
using.
+ $collectedHits = array_filter(
+ array_values( $externalData['query'][$listId] ),
+ function ( $a ) use ( $fieldId, $pageTitle ) {
+ return $a[$fieldId] === $pageTitle;
+ }
+ );
+ // If still looping over normalization, conversion or
redirects,
+ // then we need to keep the new page title for later
rounds.
+ if ( $fieldId === 'from' && is_array( $collectedHits )
) {
+ switch ( count( $collectedHits ) ) {
+ case 0:
+ break;
+ case 1:
+ $pageTitle =
$collectedHits[0]['to'];
+ break;
+ default:
+ return false;
+ }
+ }
+ // If on the pages structure we should prepare for
returning.
+ elseif ( $fieldId === 'title' && is_array(
$collectedHits ) ) {
+ switch ( count( $collectedHits ) ) {
+ case 0:
+ return false;
+ case 1:
+ return array_shift(
$collectedHits );
+ default:
+ return false;
+ }
+ }
+ }
+ // should never be here
+ return false;
+ }
+
+}
diff --git a/includes/site/MediaWikiSite.php b/includes/site/MediaWikiSite.php
index 029919c..d431a4c 100644
--- a/includes/site/MediaWikiSite.php
+++ b/includes/site/MediaWikiSite.php
@@ -1,4 +1,7 @@
<?php
+
+use MediaWiki\Site\MediaWikiPageNameNormalizer;
+
/**
* Class representing a MediaWiki site.
*
@@ -37,6 +40,11 @@
const PATH_PAGE = 'page_path';
/**
+ * @var MediaWikiPageNameNormalizer
+ */
+ private $mediaWikiPageNameNormalizer;
+
+ /**
* @since 1.21
* @deprecated since 1.21 Just use the constructor or the factory
Site::newForType
*
@@ -59,6 +67,8 @@
*/
public function __construct( $type = self::TYPE_MEDIAWIKI ) {
parent::__construct( $type );
+
+ $this->mediaWikiPageNameNormalizer = new
MediaWikiPageNameNormalizer( $this->getFileUrl( 'api.php' ) );
}
/**
@@ -96,13 +106,6 @@
* @throws MWException
*/
public function normalizePageName( $pageName ) {
-
- // Check if we have strings as arguments.
- if ( !is_string( $pageName ) ) {
- throw new MWException( '$pageName must be a string' );
- }
-
- // Go on call the external site
if ( defined( 'MW_PHPUNIT_TEST' ) ) {
// If the code is under test, don't call out to other
sites, just
// normalize locally.
@@ -112,140 +115,8 @@
$t = Title::newFromText( $pageName );
return $t->getPrefixedText();
} else {
-
- // Make sure the string is normalized into NFC (due to
T42017)
- // but do nothing to the whitespaces, that should work
appropriately.
- // @see https://phabricator.wikimedia.org/T42017
- $pageName = UtfNormal\Validator::cleanUp( $pageName );
-
- // Build the args for the specific call
- $args = array(
- 'action' => 'query',
- 'prop' => 'info',
- 'redirects' => true,
- 'converttitles' => true,
- 'format' => 'json',
- 'titles' => $pageName,
- // @todo options for maxlag and maxage
- // Note that maxlag will lead to a long delay
before a reply is made,
- // but that maxage can avoid the extreme delay.
On the other hand
- // maxage could be nice to use anyhow as it
stops unnecessary requests.
- // Also consider smaxage if maxage is used.
- );
-
- $url = wfAppendQuery( $this->getFileUrl( 'api.php' ),
$args );
-
- // Go on call the external site
- // @todo we need a good way to specify a timeout here.
- $ret = Http::get( $url, array(), __METHOD__ );
+ return
$this->mediaWikiPageNameNormalizer->normalizePageName( $pageName );
}
-
- if ( $ret === false ) {
- wfDebugLog( "MediaWikiSite", "call to external site
failed: $url" );
- return false;
- }
-
- $data = FormatJson::decode( $ret, true );
-
- if ( !is_array( $data ) ) {
- wfDebugLog( "MediaWikiSite", "call to <$url> returned
bad json: " . $ret );
- return false;
- }
-
- $page = static::extractPageRecord( $data, $pageName );
-
- if ( isset( $page['missing'] ) ) {
- wfDebugLog( "MediaWikiSite", "call to <$url> returned a
marker for a missing page title! "
- . $ret );
- return false;
- }
-
- if ( isset( $page['invalid'] ) ) {
- wfDebugLog( "MediaWikiSite", "call to <$url> returned a
marker for an invalid page title! "
- . $ret );
- return false;
- }
-
- if ( !isset( $page['title'] ) ) {
- wfDebugLog( "MediaWikiSite", "call to <$url> did not
return a page title! " . $ret );
- return false;
- }
-
- return $page['title'];
- }
-
- /**
- * Get normalization record for a given page title from an API response.
- *
- * @since 1.21
- *
- * @param array $externalData A reply from the API on a external server.
- * @param string $pageTitle Identifies the page at the external site,
needing normalization.
- *
- * @return array|bool A 'page' structure representing the page
identified by $pageTitle.
- */
- private static function extractPageRecord( $externalData, $pageTitle ) {
- // If there is a special case with only one returned page
- // we can cheat, and only return
- // the single page in the "pages" substructure.
- if ( isset( $externalData['query']['pages'] ) ) {
- $pages = array_values( $externalData['query']['pages']
);
- if ( count( $pages ) === 1 ) {
- return $pages[0];
- }
- }
- // This is only used during internal testing, as it is assumed
- // a more optimal (and lossfree) storage.
- // Make initial checks and return if prerequisites are not meet.
- if ( !is_array( $externalData ) || !isset(
$externalData['query'] ) ) {
- return false;
- }
- // Loop over the tree different named structures, that
otherwise are similar
- $structs = array(
- 'normalized' => 'from',
- 'converted' => 'from',
- 'redirects' => 'from',
- 'pages' => 'title'
- );
- foreach ( $structs as $listId => $fieldId ) {
- // Check if the substructure exist at all.
- if ( !isset( $externalData['query'][$listId] ) ) {
- continue;
- }
- // Filter the substructure down to what we actually are
using.
- $collectedHits = array_filter(
- array_values( $externalData['query'][$listId] ),
- function ( $a ) use ( $fieldId, $pageTitle ) {
- return $a[$fieldId] === $pageTitle;
- }
- );
- // If still looping over normalization, conversion or
redirects,
- // then we need to keep the new page title for later
rounds.
- if ( $fieldId === 'from' && is_array( $collectedHits )
) {
- switch ( count( $collectedHits ) ) {
- case 0:
- break;
- case 1:
- $pageTitle =
$collectedHits[0]['to'];
- break;
- default:
- return false;
- }
- }
- // If on the pages structure we should prepare for
returning.
- elseif ( $fieldId === 'title' && is_array(
$collectedHits ) ) {
- switch ( count( $collectedHits ) ) {
- case 0:
- return false;
- case 1:
- return array_shift(
$collectedHits );
- default:
- return false;
- }
- }
- }
- // should never be here
- return false;
}
/**
diff --git a/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php
b/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php
new file mode 100644
index 0000000..c571ff5
--- /dev/null
+++ b/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php
@@ -0,0 +1,86 @@
+<?php
+
+use MediaWiki\Site\MediaWikiPageNameNormalizer;
+
+/**
+ * Tests for the MediaWikiPageNameNormalizer class.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @since 1.27
+ *
+ * @ingroup Site
+ * @ingroup Test
+ *
+ * @group Site
+ * @group medium
+ *
+ * @author Marius Hoch
+ */
+class MediaWikiPageNameNormalizerTest extends PHPUnit_Framework_TestCase {
+
+ protected function setUp() {
+ parent::setUp();
+
+ static $connectivity = null;
+
+ if ( $connectivity === null ) {
+ // Check whether we have (reasonable fast) connectivity
+ $res = Http::get(
+
'https://www.wikidata.org/w/api.php?action=query&meta=siteinfo&format=json',
+ array( 'timeout' => 3 ),
+ __METHOD__
+ );
+
+ if ( $res === false || strpos( $res,
'"sitename":"Wikidata"' ) === false ) {
+ $connectivity = false;
+ } else {
+ $connectivity = true;
+ }
+ }
+
+ if ( !$connectivity ) {
+ $this->markTestSkipped(
'MediaWikiPageNameNormalizerTest needs internet connectivity.' );
+ }
+ }
+
+ /**
+ * @dataProvider normalizePageTitleProvider
+ */
+ public function testNormalizePageTitle( $expected, $pageName ) {
+ $normalizer = new MediaWikiPageNameNormalizer(
'https://www.wikidata.org/w/api.php' );
+
+ $this->assertSame( $expected, $normalizer->normalizePageName(
$pageName ) );
+ }
+
+ public function normalizePageTitleProvider() {
+ // Note: This makes (very conservative) assumptions about pages
on Wikidata
+ // existing or not.
+ return array(
+ 'universe (Q1)' => array(
+ 'Q1', 'Q1'
+ ),
+ 'Q404 redirects to Q395' => array(
+ 'Q395', 'Q404'
+ ),
+ 'there is no Q0' => array(
+ false, 'Q0'
+ )
+ );
+ }
+
+}
--
To view, visit https://gerrit.wikimedia.org/r/260255
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I008cadd29a2aa1f21098339b895c35a100959b04
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Hoo man <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits