Anja Jentzsch has uploaded a new change for review.
https://gerrit.wikimedia.org/r/51929
Change subject: (Bug 45111) Change squashing to trimming of whitespace and
control chars
......................................................................
(Bug 45111) Change squashing to trimming of whitespace and control chars
This change removes most of the squashing of whitespace and control chars.
Leading and trailing whitespace and control chars are still trimmed, and each
internal sequence of control chars (the lower block, \p{Cc}) is replaced with
a single space. This means that form feed, tab, carriage return, new line,
etc., are replaced with a single space because they are control chars, while
internal runs of ordinary spaces are no longer collapsed. The zero width
joiner and zero width non-joiner chars are passed on unchanged because they
are format chars (\p{Cf}), not control chars (\p{Cc}).
Change-Id: Icc9e48d33b9f5c4ae0a3fd8000c4c54d2883b66d
---
M lib/includes/Term.php
M lib/includes/Utils.php
M lib/tests/phpunit/UtilsTest.php
M repo/includes/Autocomment.php
M repo/includes/api/EditEntity.php
M repo/includes/api/GetEntities.php
M repo/includes/api/ModifyEntity.php
M repo/includes/api/SetAliases.php
M repo/includes/api/SetDescription.php
M repo/includes/api/SetLabel.php
M repo/includes/api/SetSiteLink.php
M repo/includes/specials/SpecialCreateEntity.php
12 files changed, 51 insertions(+), 49 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase
refs/changes/29/51929/1
diff --git a/lib/includes/Term.php b/lib/includes/Term.php
index 3de40f3..dbfaad2 100644
--- a/lib/includes/Term.php
+++ b/lib/includes/Term.php
@@ -233,7 +233,7 @@
// But that requires us to load ALL the language objects,
// which loads ALL the messages, which makes us run out
// of RAM (see bug 41103).
- return mb_strtolower( Utils::squashToNFC( $text ) );
+ return mb_strtolower( Utils::trimToNFC( $text ) );
}
/**
diff --git a/lib/includes/Utils.php b/lib/includes/Utils.php
index d9dcf5b..b3b0a10 100644
--- a/lib/includes/Utils.php
+++ b/lib/includes/Utils.php
@@ -179,7 +179,7 @@
}
/**
- * Trim initial and trailing whitespace, and compress internal ones.
+ * Trim leading and trailing whitespace and control chars, and
replace each internal run of control chars with a single space.
*
* @since 0.1
*
@@ -187,9 +187,12 @@
*
* @return string where whitespace possibly are removed.
*/
- static public function squashWhitespace( $inputString ) {
- $trimmed = preg_replace( '/^[\pZ\pC]+|[\pZ\pC]+$/u', '',
$inputString );
- return preg_replace('/[\pZ\pC]+/u', ' ', $trimmed );
+ static public function trimWhitespace( $inputString ) {
+ // \p{Z} - whitespace
+ // \p{Cc} - control chars
+ $trimmed = preg_replace( '/^[\p{Z}\p{Cc}]+|[\p{Z}\p{Cc}]+$/u',
'', $inputString );
+ $trimmed = preg_replace( '/[\p{Cc}]+/u', ' ', $trimmed );
+ return $trimmed;
}
/**
@@ -214,8 +217,8 @@
*
* @return string on NFC form
*/
- static public function squashToNFC( $inputString ) {
- return self::cleanupToNFC( self::squashWhitespace( $inputString
) );
+ static public function trimToNFC( $inputString ) {
+ return self::cleanupToNFC( self::trimWhitespace( $inputString )
);
}
/**
diff --git a/lib/tests/phpunit/UtilsTest.php b/lib/tests/phpunit/UtilsTest.php
index 819c015..f84d6e2 100644
--- a/lib/tests/phpunit/UtilsTest.php
+++ b/lib/tests/phpunit/UtilsTest.php
@@ -46,21 +46,24 @@
/**
* @group WikibaseUtils
- * @dataProvider providerSquashWhitespace
+ * @dataProvider providerTrimWhitespace
*/
- public function testSquashWhitespace( $string, $expected ) {
- $this->assertEquals( $expected, Utils::squashWhitespace(
$string ) );
+ public function testTrimWhitespace( $string, $expected ) {
+ $this->assertEquals( $expected, Utils::trimWhitespace( $string
) );
}
- public static function providerSquashWhitespace() {
+ public static function providerTrimWhitespace() {
return array(
- array( 'foo bar', 'foo bar'),
- array( ' foo bar ', 'foo bar'),
- array( ' foo bar ', 'foo bar'),
- array( "foo\tbar", 'foo bar'),
- array( "foo\nbar", 'foo bar'),
- array( "foo\rbar", 'foo bar'),
- array( "\r \t\nfoo\r\t\t\tbar\n\n\n\r\r", 'foo bar'),
+ array( 'foo bar', 'foo bar'), // #0
+ array( ' foo bar ', 'foo bar'), // #1
+ array( ' foo bar ', 'foo bar'), // #2
+ array( "foo\tbar", 'foo bar'), // #3, both a space and
control char
+ array( "foo\nbar", 'foo bar'), // #4, both a space and
control char
+ array( "foo\rbar", 'foo bar'), // #5, both a space and
control char
+ array( "\r \t\nfoo\r\t\t\tbar\n\n\n\r\r", 'foo bar'),
// #6, both space and control chars
+ array( "\r \t\nfoo\r\t\t\t bar\n\n\n\r\r", 'foo bar'),
// #7, both space and control chars
+ array( html_entity_decode( "foo&zwnj;bar", ENT_QUOTES,
"utf-8"), html_entity_decode( "foo&zwnj;bar", ENT_QUOTES, "utf-8") ), // #8
+ array( html_entity_decode( "foo&zwnj;&zwnj;bar",
ENT_QUOTES, "utf-8"), html_entity_decode( "foo&zwnj;&zwnj;bar", ENT_QUOTES,
"utf-8") ), // #9
);
}
@@ -87,18 +90,18 @@
/**
* @group WikibaseUtils
- * @dataProvider providerSquashToNFC
+ * @dataProvider providerTrimToNFC
*/
- public function testSquashToNFC( $src, $dst ) {
- $this->assertEquals( $dst, Utils::squashToNFC( $src ), "String
'$src' is not the same as the expected '$dst'" );
+ public function testTrimToNFC( $src, $dst ) {
+ $this->assertEquals( $dst, Utils::trimToNFC( $src ), "String
'$src' is not the same as the expected '$dst'" );
}
- public static function providerSquashToNFC() {
+ public static function providerTrimToNFC() {
return array(
- array( " \xC3\x85land øyene ", 'Åland øyene' ),
- array( " A\xCC\x8Aland øyene ", 'Åland øyene' ),
- array( " \xC3\x85land øyene ", 'Åland øyene' ),
- array( " A\xCC\x8Aland øyene ", 'Åland øyene' ),
+ array( " \xC3\x85land øyene ", 'Åland øyene' ), //
#0
+ array( " A\xCC\x8Aland øyene ", 'Åland øyene' ), //
#1
+ array( " \xC3\x85land øyene ", 'Åland øyene' ),
// #2
+ array( " A\xCC\x8Aland øyene ", 'Åland øyene'
), // #3
);
}
diff --git a/repo/includes/Autocomment.php b/repo/includes/Autocomment.php
index d309794..d10bb04 100644
--- a/repo/includes/Autocomment.php
+++ b/repo/includes/Autocomment.php
@@ -204,8 +204,8 @@
if ( $lang === null || $lang === false) {
$lang = $wgContLang;
}
- $comment = Utils::squashToNFC( $comment );
- $summary = Utils::squashToNFC( $summary );
+ $comment = Utils::trimToNFC( $comment );
+ $summary = Utils::trimToNFC( $summary );
$mergedString = '';
if ( $comment !== '' ) {
$mergedString .= "/* $comment */";
diff --git a/repo/includes/api/EditEntity.php b/repo/includes/api/EditEntity.php
index 8570616..8b14299 100644
--- a/repo/includes/api/EditEntity.php
+++ b/repo/includes/api/EditEntity.php
@@ -198,7 +198,7 @@
$entityContent->getEntity()->removeLabel( $arg['language'] );
}
else {
-
$entityContent->getEntity()->setLabel( $arg['language'], Utils::squashToNFC(
$arg['value'] ) );
+
$entityContent->getEntity()->setLabel( $arg['language'], Utils::trimToNFC(
$arg['value'] ) );
}
}
@@ -221,7 +221,7 @@
$entityContent->getEntity()->removeDescription( $arg['language'] );
}
else {
-
$entityContent->getEntity()->setDescription( $arg['language'],
Utils::squashToNFC( $arg['value'] ) );
+
$entityContent->getEntity()->setDescription( $arg['language'],
Utils::trimToNFC( $arg['value'] ) );
}
}
@@ -256,13 +256,13 @@
foreach ( $args as $arg ) {
$status->merge(
$this->checkMultilangArgs( $arg, $langCode, $languages ) );
if ( array_key_exists(
'remove', $arg ) ) {
-
$remAliases[$arg['language']][] = Utils::squashToNFC( $arg['value'] );
+
$remAliases[$arg['language']][] = Utils::trimToNFC( $arg['value'] );
}
elseif (
array_key_exists( 'add', $arg ) ) {
-
$addAliases[$arg['language']][] = Utils::squashToNFC( $arg['value'] );
+
$addAliases[$arg['language']][] = Utils::trimToNFC( $arg['value'] );
}
else {
-
$setAliases[$arg['language']][] = Utils::squashToNFC( $arg['value'] );
+
$setAliases[$arg['language']][] = Utils::trimToNFC( $arg['value'] );
}
}
}
@@ -310,7 +310,7 @@
}
else {
$linkSite =
$sites->getSite( $arg['site'] );
- $linkPage =
$linkSite->normalizePageName( $arg['title'] );
+ $linkPage =
$linkSite->normalizePageName( Utils::trimWhitespace( $arg['title'] ) );
if ( $linkPage ===
false ) {
wfProfileOut(
__METHOD__ );
diff --git a/repo/includes/api/GetEntities.php
b/repo/includes/api/GetEntities.php
index 2432d17..ae2ba9e 100644
--- a/repo/includes/api/GetEntities.php
+++ b/repo/includes/api/GetEntities.php
@@ -57,7 +57,7 @@
for ( $k = 0; $k < $max; $k++ ) {
$siteId = $params['sites'][$idxSites++
% $numSites];
- $title = Utils::squashToNFC(
$params['titles'][$idxTitles++ % $numTitles] );
+ $title = Utils::trimToNFC(
$params['titles'][$idxTitles++ % $numTitles] );
$id =
StoreFactory::getStore()->newSiteLinkCache()->getItemIdForLink( $siteId, $title
);
diff --git a/repo/includes/api/ModifyEntity.php
b/repo/includes/api/ModifyEntity.php
index 624847a..f207fbb 100644
--- a/repo/includes/api/ModifyEntity.php
+++ b/repo/includes/api/ModifyEntity.php
@@ -81,7 +81,7 @@
$entityTitle = $itemHandler->getTitleFromSiteLink(
$params['site'],
- Utils::squashToNFC( $params['title'] )
+ Utils::trimToNFC( $params['title'] )
);
if ( is_null( $entityTitle ) ) {
@@ -236,7 +236,7 @@
if ( isset( $params['site'] ) && isset( $params['title'] ) ) {
$normalized = array();
- $normTitle = Utils::squashToNFC( $params['title'] );
+ $normTitle = Utils::trimToNFC( $params['title'] );
if ( $normTitle !== $params['title'] ) {
$normalized['from'] = $params['title'];
$normalized['to'] = $normTitle;
diff --git a/repo/includes/api/SetAliases.php b/repo/includes/api/SetAliases.php
index 8ec63fb..3e9e2e2 100644
--- a/repo/includes/api/SetAliases.php
+++ b/repo/includes/api/SetAliases.php
@@ -90,7 +90,7 @@
$entityContent->getEntity()->setAliases(
$params['language'],
array_map(
- function( $str ) { return
Utils::squashToNFC( $str ); },
+ function( $str ) { return
Utils::trimToNFC( $str ); },
$params['set']
)
);
@@ -100,7 +100,7 @@
$entityContent->getEntity()->removeAliases(
$params['language'],
array_map(
- function( $str ) { return
Utils::squashToNFC( $str ); },
+ function( $str ) { return
Utils::trimToNFC( $str ); },
$params['remove']
)
);
@@ -110,7 +110,7 @@
$entityContent->getEntity()->addAliases(
$params['language'],
array_map(
- function( $str ) { return
Utils::squashToNFC( $str ); },
+ function( $str ) { return
Utils::trimToNFC( $str ); },
$params['add']
)
);
diff --git a/repo/includes/api/SetDescription.php
b/repo/includes/api/SetDescription.php
index 2a36757..ae75bb3 100644
--- a/repo/includes/api/SetDescription.php
+++ b/repo/includes/api/SetDescription.php
@@ -43,7 +43,7 @@
wfProfileIn( __METHOD__ );
if ( isset( $params['value'] ) ) {
- $description = Utils::squashToNFC( $params['value'] );
+ $description = Utils::trimToNFC( $params['value'] );
$language = $params['language'];
if ( 0 < strlen( $description ) ) {
$descriptions = array( $language =>
$entityContent->getEntity()->setDescription( $language, $description ) );
diff --git a/repo/includes/api/SetLabel.php b/repo/includes/api/SetLabel.php
index 8f79f16..a8f6e9e 100644
--- a/repo/includes/api/SetLabel.php
+++ b/repo/includes/api/SetLabel.php
@@ -43,7 +43,7 @@
wfProfileIn( __METHOD__ );
if ( isset( $params['value'] ) ) {
- $label = Utils::squashToNFC( $params['value'] );
+ $label = Utils::trimToNFC( $params['value'] );
$language = $params['language'];
if ( 0 < strlen( $label ) ) {
$labels = array( $language =>
$entityContent->getEntity()->setLabel( $language, $label ) );
diff --git a/repo/includes/api/SetSiteLink.php
b/repo/includes/api/SetSiteLink.php
index d863abd..486d891 100644
--- a/repo/includes/api/SetSiteLink.php
+++ b/repo/includes/api/SetSiteLink.php
@@ -91,12 +91,8 @@
protected function modifyEntity( EntityContent &$entityContent, array
$params ) {
wfProfileIn( __METHOD__ );
- if ( isset( $params['linktitle'] ) ) {
- $params['linktitle'] = Utils::squashToNFC(
$params['linktitle'] );
- }
-
if ( isset( $params['linksite'] ) && ( $params['linktitle'] ===
'' ) ) {
- $link = $entityContent->getItem()->getSiteLink(
$params['linksite'] );
+ $link = $entityContent->getItem()->getSiteLink(
Utils::trimToNFC( $params['linksite'] ) );
if ( !$link ) {
wfProfileOut( __METHOD__ );
@@ -118,7 +114,7 @@
$this->dieUsage( $this->msg(
'wikibase-api-not-recognized-siteid' )->text(), 'not-recognized-siteid' );
}
- $page = $site->normalizePageName( $params['linktitle']
);
+ $page = $site->normalizePageName(
Utils::trimWhitespace( $params['linktitle'] ) );
if ( $page === false ) {
wfProfileOut( __METHOD__ );
diff --git a/repo/includes/specials/SpecialCreateEntity.php
b/repo/includes/specials/SpecialCreateEntity.php
index f2400f2..34d2c5f 100644
--- a/repo/includes/specials/SpecialCreateEntity.php
+++ b/repo/includes/specials/SpecialCreateEntity.php
@@ -156,7 +156,7 @@
* @return bool
*/
protected function hasSufficientArguments() {
- return Utils::squashWhitespace( $this->label ) !== '' ||
Utils::squashWhitespace( $this->description ) !== '';
+ return Utils::trimWhitespace( $this->label ) !== '' ||
Utils::trimWhitespace( $this->description ) !== '';
}
/**
--
To view, visit https://gerrit.wikimedia.org/r/51929
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Icc9e48d33b9f5c4ae0a3fd8000c4c54d2883b66d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: mw1.21-wmf11
Gerrit-Owner: Anja Jentzsch <[email protected]>
Gerrit-Reviewer: John Erling Blad <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits