Daniel Kinzler has submitted this change and it was merged.

Change subject: (Bug 45111) Change squashing to trimming of whitespace and 
control chars
......................................................................


(Bug 45111) Change squashing to trimming of whitespace and control chars

The final version of this change removes most of the squashing of whitespace
and control chars. It now only trims leading and trailing whitespace and
control chars, and replaces each remaining sequence of the old control chars
(the lower block, matched as \p{Cc}) with a single space.

This means that form feed, tab, carriage return, new line, etc., are replaced
with a space because they are control chars; a run of several such chars
becomes one space. The zero width joiner and zero width non-joiner chars are
passed on unchanged because they are formatting chars (Unicode category Cf),
not control chars (category Cc).

Change-Id: Icc9e48d33b9f5c4ae0a3fd8000c4c54d2883b66d
---
M lib/includes/Term.php
M lib/includes/Utils.php
M lib/tests/phpunit/UtilsTest.php
M repo/includes/Autocomment.php
M repo/includes/api/EditEntity.php
M repo/includes/api/GetEntities.php
M repo/includes/api/ModifyEntity.php
M repo/includes/api/SetAliases.php
M repo/includes/api/SetDescription.php
M repo/includes/api/SetLabel.php
M repo/includes/api/SetSiteLink.php
M repo/includes/specials/SpecialCreateEntity.php
12 files changed, 51 insertions(+), 49 deletions(-)

Approvals:
  Daniel Kinzler: Verified; Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/includes/Term.php b/lib/includes/Term.php
index 3de40f3..dbfaad2 100644
--- a/lib/includes/Term.php
+++ b/lib/includes/Term.php
@@ -233,7 +233,7 @@
                //      But that requires us to load ALL the language objects,
                //      which loads ALL the messages, which makes us run out
                //      of RAM (see bug 41103).
-               return mb_strtolower( Utils::squashToNFC( $text ) );
+               return mb_strtolower( Utils::trimToNFC( $text ) );
        }
 
        /**
diff --git a/lib/includes/Utils.php b/lib/includes/Utils.php
index 9cfe4cd..2345c26 100644
--- a/lib/includes/Utils.php
+++ b/lib/includes/Utils.php
@@ -189,7 +189,7 @@
        }
 
        /**
-        * Trim initial and trailing whitespace, and compress internal ones.
+        * Trim initial and trailing whitespace and control chars, and 
optionally compress internal ones.
         *
         * @since 0.1
         *
@@ -197,9 +197,12 @@
         *
         * @return string where whitespace possibly are removed.
         */
-       static public function squashWhitespace( $inputString ) {
-               $trimmed = preg_replace( '/^[\pZ\pC]+|[\pZ\pC]+$/u', '', 
$inputString );
-               return preg_replace('/[\pZ\pC]+/u', ' ', $trimmed );
+       static public function trimWhitespace( $inputString ) {
+               // \p{Z} - whitespace
+               // \p{Cc} - control chars
+               $trimmed = preg_replace( '/^[\p{Z}\p{Cc}]+|[\p{Z}\p{Cc}]+$/u', 
'', $inputString );
+               $trimmed = preg_replace( '/[\p{Cc}]+/u', ' ', $trimmed );
+               return $trimmed;
        }
 
        /**
@@ -224,8 +227,8 @@
         *
         * @return string on NFC form
         */
-       static public function squashToNFC( $inputString ) {
-               return self::cleanupToNFC( self::squashWhitespace( $inputString 
) );
+       static public function trimToNFC( $inputString ) {
+               return self::cleanupToNFC( self::trimWhitespace( $inputString ) 
);
        }
 
        /**
diff --git a/lib/tests/phpunit/UtilsTest.php b/lib/tests/phpunit/UtilsTest.php
index 7c7d38e..db166c7 100644
--- a/lib/tests/phpunit/UtilsTest.php
+++ b/lib/tests/phpunit/UtilsTest.php
@@ -46,21 +46,24 @@
 
        /**
         * @group WikibaseUtils
-        * @dataProvider providerSquashWhitespace
+        * @dataProvider providerTrimWhitespace
         */
-       public function testSquashWhitespace( $string, $expected ) {
-               $this->assertEquals( $expected, Utils::squashWhitespace( 
$string ) );
+       public function testTrimWhitespace( $string, $expected ) {
+               $this->assertEquals( $expected, Utils::trimWhitespace( $string 
) );
        }
 
-       public static function providerSquashWhitespace() {
+       public static function providerTrimWhitespace() {
                return array(
-                       array( 'foo bar', 'foo bar'),
-                       array( ' foo  bar ', 'foo bar'),
-                       array( '  foo   bar  ', 'foo bar'),
-                       array( "foo\tbar", 'foo bar'),
-                       array( "foo\nbar", 'foo bar'),
-                       array( "foo\rbar", 'foo bar'),
-                       array( "\r \t\nfoo\r\t\t\tbar\n\n\n\r\r", 'foo bar'),
+                       array( 'foo bar', 'foo bar'), // #0
+                       array( ' foo  bar ', 'foo  bar'), // #1
+                       array( '  foo   bar  ', 'foo   bar'), // #2
+                       array( "foo\tbar", 'foo bar'), // #3, both a space and 
control char
+                       array( "foo\nbar", 'foo bar'), // #4, both a space and 
control char
+                       array( "foo\rbar", 'foo bar'), // #5, both a space and 
control char
+                       array( "\r \t\nfoo\r\t\t\tbar\n\n\n\r\r", 'foo bar'), 
// #6, both space and control chars
+                       array( "\r \t\nfoo\r\t\t\t bar\n\n\n\r\r", 'foo  bar'), 
// #7, both space and control chars
+                       array( html_entity_decode( "foo‌bar", ENT_QUOTES, 
"utf-8"), html_entity_decode( "foo‌bar", ENT_QUOTES, "utf-8") ), // #8
+                       array( html_entity_decode( "foo‌‌bar", 
ENT_QUOTES, "utf-8"), html_entity_decode( "foo‌‌bar", ENT_QUOTES, 
"utf-8") ), // #9
                );
        }
 
@@ -87,18 +90,18 @@
 
        /**
         * @group WikibaseUtils
-        * @dataProvider providerSquashToNFC
+        * @dataProvider providerTrimToNFC
         */
-       public function testSquashToNFC( $src, $dst ) {
-               $this->assertEquals( $dst, Utils::squashToNFC( $src ), "String 
'$src' is not the same as the expected '$dst'" );
+       public function testTrimToNFC( $src, $dst ) {
+               $this->assertEquals( $dst, Utils::trimToNFC( $src ), "String 
'$src' is not the same as the expected '$dst'" );
        }
 
-       public static function providerSquashToNFC() {
+       public static function providerTrimToNFC() {
                return array(
-                       array( "  \xC3\x85land  øyene  ", 'Åland øyene' ),
-                       array( "  A\xCC\x8Aland  øyene  ", 'Åland øyene' ),
-                       array( "  \xC3\x85land    øyene  ", 'Åland øyene' ),
-                       array( "  A\xCC\x8Aland    øyene  ", 'Åland øyene' ),
+                       array( "  \xC3\x85land  øyene  ", 'Åland  øyene' ), // 
#0
+                       array( "  A\xCC\x8Aland  øyene  ", 'Åland  øyene' ), // 
#1
+                       array( "  \xC3\x85land    øyene  ", 'Åland    øyene' ), 
// #2
+                       array( "  A\xCC\x8Aland    øyene  ", 'Åland    øyene' 
), // #3
                );
        }
 
diff --git a/repo/includes/Autocomment.php b/repo/includes/Autocomment.php
index d309794..d10bb04 100644
--- a/repo/includes/Autocomment.php
+++ b/repo/includes/Autocomment.php
@@ -204,8 +204,8 @@
                if ( $lang === null || $lang === false) {
                        $lang = $wgContLang;
                }
-               $comment = Utils::squashToNFC( $comment );
-               $summary = Utils::squashToNFC( $summary );
+               $comment = Utils::trimToNFC( $comment );
+               $summary = Utils::trimToNFC( $summary );
                $mergedString = '';
                if ( $comment !== '' ) {
                        $mergedString .=  "/* $comment */";
diff --git a/repo/includes/api/EditEntity.php b/repo/includes/api/EditEntity.php
index 8570616..8b14299 100644
--- a/repo/includes/api/EditEntity.php
+++ b/repo/includes/api/EditEntity.php
@@ -198,7 +198,7 @@
                                                        
$entityContent->getEntity()->removeLabel( $arg['language'] );
                                                }
                                                else {
-                                                       
$entityContent->getEntity()->setLabel( $arg['language'], Utils::squashToNFC( 
$arg['value'] ) );
+                                                       
$entityContent->getEntity()->setLabel( $arg['language'], Utils::trimToNFC( 
$arg['value'] ) );
                                                }
                                        }
 
@@ -221,7 +221,7 @@
                                                        
$entityContent->getEntity()->removeDescription( $arg['language'] );
                                                }
                                                else {
-                                                       
$entityContent->getEntity()->setDescription( $arg['language'], 
Utils::squashToNFC( $arg['value'] ) );
+                                                       
$entityContent->getEntity()->setDescription( $arg['language'], 
Utils::trimToNFC( $arg['value'] ) );
                                                }
                                        }
 
@@ -256,13 +256,13 @@
                                                foreach ( $args as $arg ) {
                                                        $status->merge( 
$this->checkMultilangArgs( $arg, $langCode, $languages ) );
                                                        if ( array_key_exists( 
'remove', $arg ) ) {
-                                                               
$remAliases[$arg['language']][] = Utils::squashToNFC( $arg['value'] );
+                                                               
$remAliases[$arg['language']][] = Utils::trimToNFC( $arg['value'] );
                                                        }
                                                        elseif ( 
array_key_exists( 'add', $arg ) ) {
-                                                               
$addAliases[$arg['language']][] = Utils::squashToNFC( $arg['value'] );
+                                                               
$addAliases[$arg['language']][] = Utils::trimToNFC( $arg['value'] );
                                                        }
                                                        else {
-                                                               
$setAliases[$arg['language']][] = Utils::squashToNFC( $arg['value'] );
+                                                               
$setAliases[$arg['language']][] = Utils::trimToNFC( $arg['value'] );
                                                        }
                                                }
                                        }
@@ -310,7 +310,7 @@
                                                }
                                                else {
                                                        $linkSite = 
$sites->getSite( $arg['site'] );
-                                                       $linkPage = 
$linkSite->normalizePageName( $arg['title'] );
+                                                       $linkPage = 
$linkSite->normalizePageName( Utils::trimWhitespace( $arg['title'] ) );
 
                                                        if ( $linkPage === 
false ) {
                                                                wfProfileOut( 
__METHOD__ );
diff --git a/repo/includes/api/GetEntities.php 
b/repo/includes/api/GetEntities.php
index 2432d17..ae2ba9e 100644
--- a/repo/includes/api/GetEntities.php
+++ b/repo/includes/api/GetEntities.php
@@ -57,7 +57,7 @@
 
                                for ( $k = 0; $k < $max; $k++ ) {
                                        $siteId = $params['sites'][$idxSites++ 
% $numSites];
-                                       $title = Utils::squashToNFC( 
$params['titles'][$idxTitles++ % $numTitles] );
+                                       $title = Utils::trimToNFC( 
$params['titles'][$idxTitles++ % $numTitles] );
 
                                        $id = 
StoreFactory::getStore()->newSiteLinkCache()->getItemIdForLink( $siteId, $title 
);
 
diff --git a/repo/includes/api/ModifyEntity.php 
b/repo/includes/api/ModifyEntity.php
index 624847a..f207fbb 100644
--- a/repo/includes/api/ModifyEntity.php
+++ b/repo/includes/api/ModifyEntity.php
@@ -81,7 +81,7 @@
 
                        $entityTitle = $itemHandler->getTitleFromSiteLink(
                                $params['site'],
-                               Utils::squashToNFC( $params['title'] )
+                               Utils::trimToNFC( $params['title'] )
                        );
 
                        if ( is_null( $entityTitle ) ) {
@@ -236,7 +236,7 @@
                if ( isset( $params['site'] ) && isset( $params['title'] ) ) {
                        $normalized = array();
 
-                       $normTitle = Utils::squashToNFC( $params['title'] );
+                       $normTitle = Utils::trimToNFC( $params['title'] );
                        if ( $normTitle !== $params['title'] ) {
                                $normalized['from'] = $params['title'];
                                $normalized['to'] = $normTitle;
diff --git a/repo/includes/api/SetAliases.php b/repo/includes/api/SetAliases.php
index 8ec63fb..3e9e2e2 100644
--- a/repo/includes/api/SetAliases.php
+++ b/repo/includes/api/SetAliases.php
@@ -90,7 +90,7 @@
                        $entityContent->getEntity()->setAliases(
                                $params['language'],
                                array_map(
-                                       function( $str ) { return 
Utils::squashToNFC( $str ); },
+                                       function( $str ) { return 
Utils::trimToNFC( $str ); },
                                        $params['set']
                                )
                        );
@@ -100,7 +100,7 @@
                        $entityContent->getEntity()->removeAliases(
                                $params['language'],
                                array_map(
-                                       function( $str ) { return 
Utils::squashToNFC( $str ); },
+                                       function( $str ) { return 
Utils::trimToNFC( $str ); },
                                        $params['remove']
                                )
                        );
@@ -110,7 +110,7 @@
                        $entityContent->getEntity()->addAliases(
                                $params['language'],
                                array_map(
-                                       function( $str ) { return 
Utils::squashToNFC( $str ); },
+                                       function( $str ) { return 
Utils::trimToNFC( $str ); },
                                        $params['add']
                                )
                        );
diff --git a/repo/includes/api/SetDescription.php 
b/repo/includes/api/SetDescription.php
index 2a36757..ae75bb3 100644
--- a/repo/includes/api/SetDescription.php
+++ b/repo/includes/api/SetDescription.php
@@ -43,7 +43,7 @@
                wfProfileIn( __METHOD__ );
 
                if ( isset( $params['value'] ) ) {
-                       $description = Utils::squashToNFC( $params['value'] );
+                       $description = Utils::trimToNFC( $params['value'] );
                        $language = $params['language'];
                        if ( 0 < strlen( $description ) ) {
                                $descriptions = array( $language => 
$entityContent->getEntity()->setDescription( $language, $description ) );
diff --git a/repo/includes/api/SetLabel.php b/repo/includes/api/SetLabel.php
index 8f79f16..a8f6e9e 100644
--- a/repo/includes/api/SetLabel.php
+++ b/repo/includes/api/SetLabel.php
@@ -43,7 +43,7 @@
                wfProfileIn( __METHOD__ );
 
                if ( isset( $params['value'] ) ) {
-                       $label = Utils::squashToNFC( $params['value'] );
+                       $label = Utils::trimToNFC( $params['value'] );
                        $language = $params['language'];
                        if ( 0 < strlen( $label ) ) {
                                $labels = array( $language => 
$entityContent->getEntity()->setLabel( $language, $label ) );
diff --git a/repo/includes/api/SetSiteLink.php 
b/repo/includes/api/SetSiteLink.php
index d863abd..486d891 100644
--- a/repo/includes/api/SetSiteLink.php
+++ b/repo/includes/api/SetSiteLink.php
@@ -91,12 +91,8 @@
        protected function modifyEntity( EntityContent &$entityContent, array 
$params ) {
                wfProfileIn( __METHOD__ );
 
-               if ( isset( $params['linktitle'] ) ) {
-                       $params['linktitle'] = Utils::squashToNFC( 
$params['linktitle'] );
-               }
-
                if ( isset( $params['linksite'] ) && ( $params['linktitle'] === 
'' ) ) {
-                       $link = $entityContent->getItem()->getSiteLink( 
$params['linksite'] );
+                       $link = $entityContent->getItem()->getSiteLink( 
Utils::trimToNFC( $params['linksite'] ) );
 
                        if ( !$link ) {
                                wfProfileOut( __METHOD__ );
@@ -118,7 +114,7 @@
                                $this->dieUsage( $this->msg( 
'wikibase-api-not-recognized-siteid' )->text(), 'not-recognized-siteid' );
                        }
 
-                       $page = $site->normalizePageName( $params['linktitle'] 
);
+                       $page = $site->normalizePageName( 
Utils::trimWhitespace( $params['linktitle'] ) );
 
                        if ( $page === false ) {
                                wfProfileOut( __METHOD__ );
diff --git a/repo/includes/specials/SpecialCreateEntity.php 
b/repo/includes/specials/SpecialCreateEntity.php
index f2400f2..34d2c5f 100644
--- a/repo/includes/specials/SpecialCreateEntity.php
+++ b/repo/includes/specials/SpecialCreateEntity.php
@@ -156,7 +156,7 @@
         * @return bool
         */
        protected function hasSufficientArguments() {
-               return Utils::squashWhitespace( $this->label ) !== '' || 
Utils::squashWhitespace( $this->description ) !== '';
+               return Utils::trimWhitespace( $this->label ) !== '' || 
Utils::trimWhitespace( $this->description ) !== '';
        }
 
        /**

-- 
To view, visit https://gerrit.wikimedia.org/r/50365
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Icc9e48d33b9f5c4ae0a3fd8000c4c54d2883b66d
Gerrit-PatchSet: 7
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: John Erling Blad <[email protected]>
Gerrit-Reviewer: Anja Jentzsch <[email protected]>
Gerrit-Reviewer: Aude <[email protected]>
Gerrit-Reviewer: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Jens Ohlig <[email protected]>
Gerrit-Reviewer: John Erling Blad <[email protected]>
Gerrit-Reviewer: Tobias Gritschacher <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to