TTO has uploaded a new change for review.
https://gerrit.wikimedia.org/r/149293
Change subject: Proper namespace handling for WikiImporter
......................................................................
Proper namespace handling for WikiImporter
Up until now, the import backend has tried to resolve titles in the XML
data using the regular Title class. This is a disastrous idea, as local
namespace names often do not match foreign namespace titles.
There is enough metadata present in XML dumps generated by modern MW
versions for the target namespace ID and name to be reliably determined.
This metadata is contained in the <siteinfo> and <ns> tags, which
(unbelievably enough) was totally ignored by WikiImporter until now.
Fallbacks are provided for older XML dump versions which may be missing
some or all of this metadata.
The ForeignTitle class is introduced. This is intended specifically for
the resolution of titles on foreign wikis. It would have been possible to
extend the existing TitleValue infrastructure for this, but TitleValue
contains a fragment (not required for import) and does not hold namespace
ID alongside name. Additionally, the separation into Value/Formatter/Parser
is overkill for this simple application.
The 'AfterImportPage' hook has been modified so the second argument is a
ForeignTitle object instead of a Title (the documentation was wrong, it
was never a string). LiquidThreads, SMW and FacetedSearch all use this
hook but none of them use the $origTitle parameter.
Also clarifying some error messages.
Bug: 30723
Bug: 40192
Change-Id: Iaa58e1b9fd7287cdf999cef6a6f3bb63cd2a4778
---
M includes/AutoLoader.php
M includes/Import.php
M includes/specials/SpecialImport.php
M languages/i18n/en.json
M tests/phpunit/includes/ImportTest.php
5 files changed, 223 insertions(+), 49 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core
refs/changes/93/149293/1
diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php
index b3b51ed..c79c567 100644
--- a/includes/AutoLoader.php
+++ b/includes/AutoLoader.php
@@ -73,6 +73,7 @@
'FeedItem' => 'includes/Feed.php',
'FeedUtils' => 'includes/FeedUtils.php',
'FileDeleteForm' => 'includes/FileDeleteForm.php',
+ 'ForeignTitle' => 'includes/Import.php',
'ForkController' => 'includes/ForkController.php',
'FormOptions' => 'includes/FormOptions.php',
'GitInfo' => 'includes/GitInfo.php',
diff --git a/includes/Import.php b/includes/Import.php
index 2eb5667..090fa98 100644
--- a/includes/Import.php
+++ b/includes/Import.php
@@ -25,6 +25,133 @@
*/
/**
+ * A simple, immutable structure to hold the title of a page on a foreign
+ * MediaWiki installation.
+ */
+
+class ForeignTitle {
+ /** @var int */
+ private $namespaceId;
+ /** @var string */
+ private $namespaceName;
+ /** @var string */
+ private $pageName;
+
+ /**
+ * @param int $namespaceId
+ * @param string $namespaceName
+ * @param string $pageName
+ */
+ protected function __construct( $namespaceId, $namespaceName, $pageName
) {
+ $this->namespaceId = $namespaceId;
+ $this->namespaceName = $namespaceName;
+ $this->pageName = $pageName;
+ }
+
+ /**
+ * Creates a ForeignTitle object based on the <title> and <ns>
attributes
+ * and the list of namespaces found in the exported XML dump.
+ *
+ * Although exported XML dumps have contained a map of namespace IDs to
names
+ * since MW 1.5, the importer used to completely ignore the <siteinfo>
tag
+ * before MW 1.24. It is therefore possible that custom XML dumps
(i.e. not
+ * generated by Special:Export) have been created without this
metadata, and
+ * it would be a retrograde step if these simply stopped working in
current
+ * versions. As a result, this code falls back to using namespace data
for
+ * the local wiki (similar to buggy pre-1.24 behaviour) if
$foreignNamespaces
+ * is empty.
+ *
+ * @param string $title The <title> element from the XML
+ * @param string|null $ns The <ns> element from the XML, or null if no
such
+ * element is present
+ * @param array|null $foreignNamespaces An array 'id' => 'name' of the
data
+ * under siteinfo/namespaces in the XML dump, or null if no such data is
+ * available
+ * @return ForeignTitle
+ */
+ public static function newFromXmlData( $title, $ns, $foreignNamespaces
) {
+ global $wgContLang;
+ $pieces = explode( ':', $title, 2 );
+
+ // Export schema version 0.5 and earlier (MW 1.18 and earlier)
does not
+ // contain a <ns> tag, so we need to be able to handle that
case.
+ if ( is_null( $ns ) ) {
+ if ( is_null( $foreignNamespaces ) ) {
+ // We don't know much, so we just use the local
wiki's namespace names
+ // to resolve this. Better than nothing, and
mimics old crappy behavior
+ if ( count( $pieces ) === 2 &&
+ $wgContLang->getNsIndex(
$pieces[0] ) !== false ) {
+ $namespaceName = $pieces[0];
+ $pageName = $pieces[1];
+ } else {
+ $namespaceName = '';
+ $pageName = $title;
+ }
+ $ns = $wgContLang->getNsIndex( $pieces[0] );
+ } else {
+ // Does the part before the colon match a known
foreign namespace?
+ if ( count( $pieces ) === 2 &&
+ in_array( $pieces[0],
$foreignNamespaces ) ) {
+ $namespaceName = $pieces[0];
+ $pageName = $pieces[1];
+ } else {
+ // Colon is semantically meaningless
+ $namespaceName = '';
+ $pageName = $title;
+ }
+ $ns = array_search( $namespaceName,
$foreignNamespaces );
+ }
+ } else {
+ // We know the namespace ID
+ if ( is_null( $foreignNamespaces ) ) {
+ if ( $ns == '0' ) {
+ $namespaceName = '';
+ $pageName = $title;
+ } else {
+ list( $namespaceName, $pageName ) =
$pieces;
+ }
+ } else {
+ $namespaceName = $foreignNamespaces[$ns];
+ // Make sure we're not in the main namespace,
where the colon is
+ // semantically meaningless
+ if ( count( $pieces ) === 2 && $namespaceName
!== "" ) {
+ $pageName = $pieces[1];
+ } else {
+ $pageName = $title;
+ }
+ }
+ }
+
+ return new ForeignTitle( $ns, $namespaceName, $pageName );
+ }
+
+ /** @return int */
+ public function getNamespaceId() {
+ return $this->namespaceId;
+ }
+
+ /** @return string */
+ public function getNamespaceName() {
+ return $this->namespaceName;
+ }
+
+ /** @return string */
+ public function getPageName() {
+ return $this->pageName;
+ }
+
+ /** @return string */
+ public function getFullText() {
+ $result = '';
+ if ( $this->namespaceName ) {
+ $result .= $this->namespaceName . ':';
+ }
+ $result .= $this->pageName;
+ return $result;
+ }
+}
+
+/**
* XML file reader for the page data importer
*
* implements Special:Import
@@ -32,6 +159,7 @@
*/
class WikiImporter {
private $reader = null;
+ private $foreignNamespaces = null;
private $mLogItemCallback, $mUploadCallback, $mRevisionCallback,
$mPageCallback;
private $mSiteInfoCallback, $mTargetNamespace, $mTargetRootPage,
$mPageOutCallback;
private $mNoticeCallback, $mDebug;
@@ -312,13 +440,13 @@
/**
* Mostly for hook use
* @param Title $title
- * @param string $origTitle
+ * @param ForeignTitle $foreignTitle
* @param int $revCount
* @param int $sRevCount
* @param array $pageInfo
* @return bool
*/
- public function finishImportPage( $title, $origTitle, $revCount,
$sRevCount, $pageInfo ) {
+ public function finishImportPage( $title, $foreignTitle, $revCount,
$sRevCount, $pageInfo ) {
$args = func_get_args();
return wfRunHooks( 'AfterImportPage', $args );
}
@@ -341,6 +469,20 @@
}
/**
+ * Notify the callback function of site info
+ * @param array $siteInfo
+ * @return bool|mixed
+ */
+ private function siteInfoCallback( $siteInfo ) {
+ if ( isset( $this->mSiteInfoCallback ) ) {
+ return call_user_func_array( $this->mSiteInfoCallback,
+ array( $siteInfo, $this ) );
+ } else {
+ return false;
+ }
+ }
+
+ /**
* Notify the callback function when a new "<page>" is reached.
* @param Title $title
*/
@@ -353,12 +495,12 @@
/**
* Notify the callback function when a "</page>" is closed.
* @param Title $title
- * @param Title $origTitle
+ * @param ForeignTitle $foreignTitle
* @param int $revCount
* @param int $sucCount Number of revisions for which callback returned
true
* @param array $pageInfo Associative array of page information
*/
- private function pageOutCallback( $title, $origTitle, $revCount,
$sucCount, $pageInfo ) {
+ private function pageOutCallback( $title, $foreignTitle, $revCount,
$sucCount, $pageInfo ) {
if ( isset( $this->mPageOutCallback ) ) {
$args = func_get_args();
call_user_func_array( $this->mPageOutCallback, $args );
@@ -526,18 +668,31 @@
return true;
}
- /**
- * @return bool
- * @throws MWException
- */
private function handleSiteInfo() {
- // Site info is useful, but not actually used for dump imports.
- // Includes a quick short-circuit to save performance.
- if ( !$this->mSiteInfoCallback ) {
- $this->reader->next();
- return true;
+ $this->debug( "Enter site info handler." );
+ $siteInfo = array();
+
+ // Fields that can just be stuffed in the siteInfo object
+ $normalFields = array( 'sitename', 'base', 'generator', 'case'
);
+
+ while ( $this->reader->read() ) {
+ if ( $this->reader->nodeType == XmlReader::END_ELEMENT
&&
+ $this->reader->name == 'siteinfo' ) {
+ break;
+ }
+
+ $tag = $this->reader->name;
+
+ if ( $tag == 'namespace' ) {
+ $this->foreignNamespaces[ $this->nodeAttribute(
'key' ) ] =
+ $this->nodeContents();
+ } elseif ( in_array( $tag, $normalFields ) ) {
+ $siteInfo[$tag] = $this->nodeContents();
+ }
}
- throw new MWException( "SiteInfo tag is not yet handled, do not
set mSiteInfoCallback" );
+
+ $siteInfo['_namespaces'] = $this->foreignNamespaces;
+ $this->siteInfoCallback( $siteInfo );
}
private function handleLogItem() {
@@ -607,7 +762,7 @@
$pageInfo = array( 'revisionCount' => 0,
'successfulRevisionCount' => 0 );
// Fields that can just be stuffed in the pageInfo object
- $normalFields = array( 'title', 'id', 'redirect',
'restrictions' );
+ $normalFields = array( 'title', 'ns', 'id', 'redirect',
'restrictions' );
$skip = false;
$badTitle = false;
@@ -617,6 +772,8 @@
$this->reader->name == 'page' ) {
break;
}
+
+ $skip = false;
$tag = $this->reader->name;
@@ -638,29 +795,35 @@
$pageInfo[$tag] = $this->nodeAttribute(
'title' );
} else {
$pageInfo[$tag] = $this->nodeContents();
- if ( $tag == 'title' ) {
- $title = $this->processTitle(
$pageInfo['title'] );
+ }
+ } elseif ( $tag == 'revision' || $tag == 'upload' ) {
+ if ( !isset( $title ) ) {
+ $title = $this->processTitle(
$pageInfo['title'],
+ isset( $pageInfo['ns'] ) ?
$pageInfo['ns'] : null );
- if ( !$title ) {
- $badTitle = true;
- $skip = true;
- }
+ if ( !$title ) {
+ $badTitle = true;
+ $skip = true;
+ }
- $this->pageCallback( $title );
- list( $pageInfo['_title'],
$origTitle ) = $title;
+ $this->pageCallback( $title );
+ list( $pageInfo['_title'],
$foreignTitle ) = $title;
+ }
+
+ if ( $title ) {
+ if ( $tag == 'revision' ) {
+ $this->handleRevision(
$pageInfo );
+ } else {
+ $this->handleUpload( $pageInfo
);
}
}
- } elseif ( $tag == 'revision' ) {
- $this->handleRevision( $pageInfo );
- } elseif ( $tag == 'upload' ) {
- $this->handleUpload( $pageInfo );
} elseif ( $tag != '#text' ) {
$this->warn( "Unhandled page XML tag $tag" );
$skip = true;
}
}
- $this->pageOutCallback( $pageInfo['_title'], $origTitle,
+ $this->pageOutCallback( $pageInfo['_title'], $foreignTitle,
$pageInfo['revisionCount'],
$pageInfo['successfulRevisionCount'],
$pageInfo );
@@ -885,24 +1048,34 @@
/**
* @param string $text
+ * @param string|null $ns
* @return array|bool
*/
- private function processTitle( $text ) {
- global $wgCommandLineMode;
+ private function processTitle( $text, $ns = null ) {
+ global $wgContLang, $wgCommandLineMode;
$workTitle = $text;
- $origTitle = Title::newFromText( $workTitle );
+ $foreignTitle = ForeignTitle::newFromXmlData( $text, $ns,
$this->foreignNamespaces );
- if ( !is_null( $this->mTargetNamespace ) && !is_null(
$origTitle ) ) {
- # makeTitleSafe, because $origTitle can have a
interwiki (different setting of interwiki map)
- # and than dbKey can begin with a lowercase char
- $title = Title::makeTitleSafe( $this->mTargetNamespace,
- $origTitle->getDBkey() );
+ if ( !is_null( $this->mTargetRootPage ) ) {
+ # We don't need to use ForeignTitle here, as the target
root page will
+ # give us the namespace component
+ $title = Title::newFromText( $this->mTargetRootPage .
'/' . $workTitle );
} else {
- if ( !is_null( $this->mTargetRootPage ) ) {
- $workTitle = $this->mTargetRootPage . '/' .
$workTitle;
+ $targetNs = $this->mTargetNamespace ?:
$foreignTitle->getNamespaceId();
+ if ( MWNamespace::exists( $targetNs ) ) {
+ # We have a matching namespace ID locally, so
put the page there
+ $title = Title::makeTitleSafe( $targetNs,
$foreignTitle->getPageName() );
+ } else {
+ # Try for a matching namespace name
+ $targetNs = $wgContLang->getNsIndex(
$foreignTitle->getNamespaceName() );
+ if ( $targetNs !== false ) {
+ $title = Title::makeTitleSafe(
$targetNs, $foreignTitle->getPageName() );
+ } else {
+ # Just fall back to main namespace
+ $title = Title::makeTitleSafe( 0,
$foreignTitle->getFullText() );
+ }
}
- $title = Title::newFromText( $workTitle );
}
if ( is_null( $title ) ) {
@@ -925,7 +1098,7 @@
return false;
}
- return array( $title, $origTitle );
+ return array( $title, $foreignTitle );
}
}
diff --git a/includes/specials/SpecialImport.php
b/includes/specials/SpecialImport.php
index 28e6479..466622e 100644
--- a/includes/specials/SpecialImport.php
+++ b/includes/specials/SpecialImport.php
@@ -453,13 +453,13 @@
/**
* @param Title $title
- * @param Title $origTitle
+ * @param ForeignTitle $foreignTitle
* @param int $revisionCount
* @param int $successCount
* @param array $pageInfo
* @return void
*/
- function reportPage( $title, $origTitle, $revisionCount, $successCount,
$pageInfo ) {
+ function reportPage( $title, $foreignTitle, $revisionCount,
$successCount, $pageInfo ) {
$args = func_get_args();
call_user_func_array( $this->mOriginalPageOutCallback, $args );
@@ -488,7 +488,7 @@
$log->addEntry( 'upload', $title, $detail,
array(), $this->getUser() );
} else {
$interwiki = '[[:' . $this->mInterwiki . ':' .
- $origTitle->getPrefixedText() . ']]';
+ $foreignTitle->getFullText() . ']]';
$detail = $this->msg(
'import-logentry-interwiki-detail' )->numParams(
$successCount )->params( $interwiki
)->inContentLanguage()->text();
if ( $this->reason ) {
diff --git a/languages/i18n/en.json b/languages/i18n/en.json
index dd7c371..d50fc39 100644
--- a/languages/i18n/en.json
+++ b/languages/i18n/en.json
@@ -2301,11 +2301,11 @@
"import-upload": "Upload XML data",
"import-token-mismatch": "Loss of session data.\nPlease try again.",
"import-invalid-interwiki": "Cannot import from the specified wiki.",
- "import-error-edit": "Page \"$1\" is not imported because you are not
allowed to edit it.",
- "import-error-create": "Page \"$1\" is not imported because you are not
allowed to create it.",
- "import-error-interwiki": "Page \"$1\" is not imported because its name
is reserved for external linking (interwiki).",
- "import-error-special": "Page \"$1\" is not imported because it belongs
to a special namespace that does not allow pages.",
- "import-error-invalid": "Page \"$1\" is not imported because its name
is invalid.",
+ "import-error-edit": "Page \"$1\" was not imported because you are not
allowed to edit it.",
+ "import-error-create": "Page \"$1\" was not imported because you are
not allowed to create it.",
+ "import-error-interwiki": "Page \"$1\" was not imported because its
name is reserved for external linking (interwiki).",
+ "import-error-special": "Page \"$1\" was not imported because it
belongs to a special namespace that does not allow pages.",
+ "import-error-invalid": "Page \"$1\" was not imported because the name
to which it would be imported is invalid on this wiki.",
"import-error-unserialize": "Revision $2 of page \"$1\" could not be
unserialized. The revision was reported to use content model $3 serialized as
$4.",
"import-error-bad-location": "Revision $2 using content model $3 cannot
be stored on \"$1\" on this wiki, since that model is not supported on that
page.",
"import-options-wrong": "Wrong {{PLURAL:$2|option|options}}:
<nowiki>$1</nowiki>",
diff --git a/tests/phpunit/includes/ImportTest.php
b/tests/phpunit/includes/ImportTest.php
index f82a756..a2487cb 100644
--- a/tests/phpunit/includes/ImportTest.php
+++ b/tests/phpunit/includes/ImportTest.php
@@ -27,7 +27,7 @@
$source = $this->getInputStreamSource( $xml );
$redirect = null;
- $callback = function ( $title, $origTitle, $revCount,
$sRevCount, $pageInfo ) use ( &$redirect ) {
+ $callback = function ( $title, $foreignTitle, $revCount,
$sRevCount, $pageInfo ) use ( &$redirect ) {
if ( array_key_exists( 'redirect', $pageInfo ) ) {
$redirect = $pageInfo['redirect'];
}
--
To view, visit https://gerrit.wikimedia.org/r/149293
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Iaa58e1b9fd7287cdf999cef6a6f3bb63cd2a4778
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: TTO <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits