[MediaWiki-commits] [Gerrit] unicodejs[master]: Go back to storing strings as code units

Esanders (Code Review) Thu, 16 Nov 2017 05:36:40 -0800

Esanders has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391820 )


Change subject: Go back to storing strings as code units
......................................................................

Go back to storing strings as code units

Instead implement next/prevCodepoint methods so we can
iterate over the data structure as if it were codepoints.

Change-Id: If30035a6f63397ed2e583922200299ee228249c9
---
M src/unicodejs.js
M src/unicodejs.textstring.js
M src/unicodejs.wordbreak.js
M tests/unicodejs.test.js
M tests/unicodejs.wordbreak.test.js
5 files changed, 95 insertions(+), 50 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/unicodejs refs/changes/20/391820/1

diff --git a/src/unicodejs.js b/src/unicodejs.js
index cba801c..404c900 100644
--- a/src/unicodejs.js
+++ b/src/unicodejs.js
@@ -29,6 +29,26 @@
        };
 
        /**
+        * Check if a code unit is the leading half of a surrogate pair
+        *
+        * @param {string} unit Code unit
+        * @return {boolean}
+        */
+       unicodeJS.isLeadingSurrogate = function ( unit ) {
+               return unit && unit.match( /^[\uD800-\uDBFF]$/ );
+       };
+
+       /**
+        * Check if a code unit is the trailing half of a surrogate pair
+        *
+        * @param {string} unit Code unit
+        * @return {boolean}
+        */
+       unicodeJS.isTrailingSurrogate = function ( unit ) {
+               return unit && unit.match( /^[\uDC00-\uDFFF]$/ );
+       };
+
+       /**
         * Write a UTF-16 code unit as a javascript string literal.
         *
         * @private
diff --git a/src/unicodejs.textstring.js b/src/unicodejs.textstring.js
index 99246cb..038fca2 100644
--- a/src/unicodejs.textstring.js
+++ b/src/unicodejs.textstring.js
@@ -16,7 +16,7 @@
  * @param {string} text Text
  */
 unicodeJS.TextString = function UnicodeJSTextString( text ) {
-       this.codepoints = unicodeJS.splitCharacters( text );
+       this.text = text;
 };
 
 /* Methods */
@@ -29,18 +29,39 @@
  * @return {string|null} Unicode codepoint, or null if out of bounds
  */
 unicodeJS.TextString.prototype.read = function ( position ) {
-       var codepointAt = this.codepoints[ position ];
-       return codepointAt !== undefined ? codepointAt : null;
+       var dataAt = this.text[ position ];
+       return dataAt !== undefined ? dataAt : null;
 };
 
-/**
- * Return number of codepoints in the text string
- *
- * @method
- * @return {number} Number of codepoints
- */
-unicodeJS.TextString.prototype.getLength = function () {
-       return this.codepoints.length;
+unicodeJS.TextString.prototype.nextCodepoint = function ( position ) {
+       var trailing,
+               codepoint = this.read( position );
+
+       if ( unicodeJS.isLeadingSurrogate( codepoint ) ) {
+               trailing = this.read( position + 1 );
+               if ( unicodeJS.isTrailingSurrogate( trailing ) ) {
+                       codepoint += trailing;
+               }
+       }
+       return codepoint;
+};
+
+unicodeJS.TextString.prototype.prevCodepoint = function ( position ) {
+       var leading,
+               codepoint = this.read( position - 1 );
+
+       if ( unicodeJS.isTrailingSurrogate( codepoint ) ) {
+               leading = this.read( position - 2 );
+               if ( unicodeJS.isLeadingSurrogate( leading ) ) {
+                       codepoint = leading + codepoint;
+               }
+       }
+       return codepoint;
+};
+
+unicodeJS.TextString.prototype.isMidSurrogate = function ( position ) {
+       return unicodeJS.isLeadingSurrogate( this.read( position - 1 ) ) &&
+               unicodeJS.isTrailingSurrogate( this.read( position ) );
 };
 
 /**
@@ -49,5 +70,5 @@
  * @return {string} Plain javascript string
  */
 unicodeJS.TextString.prototype.toString = function () {
-       return this.codepoints.join( '' );
+       return this.text;
 };
diff --git a/src/unicodejs.wordbreak.js b/src/unicodejs.wordbreak.js
index ea1e0e3..a3a3c3d 100644
--- a/src/unicodejs.wordbreak.js
+++ b/src/unicodejs.wordbreak.js
@@ -110,14 +110,14 @@
        /**
         * Evaluates whether a position within some text is a word boundary.
         *
-        * The text object elements may be codepoints or code units (deprecated)
+        * The text object elements may be codepoints or code units
         *
         * @param {unicodeJS.TextString} string TextString
         * @param {number} pos Character position
         * @return {boolean} Is the position a word boundary
         */
        wordbreak.isBreak = function ( string, pos ) {
-               var nextRgt, nextLft,
+               var nextCodepoint, prevCodepoint, nextProperty, prevProperty,
                        lft = [],
                        rgt = [],
                        l = 0,
@@ -130,18 +130,18 @@
                        return true;
                }
 
-               // Compatibility with TextString objects that split codepoints
                // Do not break inside surrogate pair
-               if (
-                       string.read( pos - 1 ).match( /^[\uD800-\uDBFF]$/ ) &&
-                       string.read( pos ).match( /^[\uDC00-\uDFFF]$/ )
-               ) {
+               if ( string.isMidSurrogate( pos ) ) {
                        return false;
                }
 
-               // get some context
-               rgt.push( getProperty( string.read( pos + r ) ) );
-               lft.push( getProperty( string.read( pos - l - 1 ) ) );
+               // Get some context
+               nextCodepoint = string.nextCodepoint( pos + r );
+               prevCodepoint = string.prevCodepoint( pos - l );
+               rgt.push( getProperty( nextCodepoint ) );
+               lft.push( getProperty( prevCodepoint ) );
+               r += nextCodepoint.length;
+               l += prevCodepoint.length;
 
                switch ( true ) {
                        // Do not break within CRLF.
@@ -166,12 +166,13 @@
                }
                // We've reached the end of an Extend|Format sequence, collapse 
it
                while ( lft[ 0 ] === 'Extend' || lft[ 0 ] === 'Format' ) {
-                       l++;
                        if ( pos - l - 1 < 0 ) {
                                // start of document
                                return true;
                        }
-                       lft[ lft.length - 1 ] = getProperty( string.read( pos - 
l - 1 ) );
+                       prevCodepoint = string.prevCodepoint( pos - l );
+                       lft[ 0 ] = getProperty( prevCodepoint );
+                       l += prevCodepoint.length;
                }
 
                // Do not break between most letters.
@@ -185,15 +186,25 @@
 
                // Some tests beyond this point require more context, as per 
WB4 ignore Format and Extend.
                do {
-                       r++;
-                       nextRgt = getProperty( string.read( pos + r ) );
-               } while ( nextRgt === 'Extend' || nextRgt === 'Format' );
-               rgt.push( nextRgt );
+                       nextCodepoint = string.nextCodepoint( pos + r );
+                       if ( nextCodepoint === null ) {
+                               nextProperty = null;
+                               break;
+                       }
+                       r += nextCodepoint.length;
+                       nextProperty = getProperty( nextCodepoint );
+               } while ( nextProperty === 'Extend' || nextProperty === 
'Format' );
+               rgt.push( nextProperty );
                do {
-                       l++;
-                       nextLft = getProperty( string.read( pos - l - 1 ) );
-               } while ( nextLft === 'Extend' || nextLft === 'Format' );
-               lft.push( nextLft );
+                       prevCodepoint = string.prevCodepoint( pos - l );
+                       if ( prevCodepoint === null ) {
+                               prevProperty = null;
+                               break;
+                       }
+                       l += prevCodepoint.length;
+                       prevProperty = getProperty( prevCodepoint );
+               } while ( prevProperty === 'Extend' || prevProperty === 
'Format' );
+               lft.push( prevProperty );
 
                switch ( true ) {
                        // Do not break letters across certain punctuation.
diff --git a/tests/unicodejs.test.js b/tests/unicodejs.test.js
index 8cd53d1..472f0c8 100644
--- a/tests/unicodejs.test.js
+++ b/tests/unicodejs.test.js
@@ -18,12 +18,18 @@
                        data = line.split( ' ' );
 
                data.forEach( function ( str, i ) {
+                       var codepoint;
                        if ( i % 2 === 0 ) {
                                // Tests at even offsets
                                expected.push( breakMap[ str ] );
                        } else {
+                               codepoint = +( '0x' + str );
                                // Chars at odd offsets
-                               chars += String.fromCodePoint( +( '0x' + str ) 
);
+                               chars += String.fromCodePoint( codepoint );
+                               // For surrogate pairs, add an expected 
no-break between them
+                               if ( codepoint > 0xFFFF ) {
+                                       expected.push( false );
+                               }
                        }
                } );
 
diff --git a/tests/unicodejs.wordbreak.test.js 
b/tests/unicodejs.wordbreak.test.js
index 209496d..8c78c15 100644
--- a/tests/unicodejs.wordbreak.test.js
+++ b/tests/unicodejs.wordbreak.test.js
@@ -8,28 +8,16 @@
 QUnit.module( 'unicodeJS.wordbreak' );
 
 QUnit.test( 'Unicode test suite', function ( assert ) {
-       var i, textString, result;
-
        unicodeJS.testdata.wordbreak.reduce( unicodeJS.test.parseTestReduce, [] 
).forEach( function ( test ) {
+               var i,
+                       textString = new unicodeJS.TextString( test.string ),
+                       result = [];
 
-               textString = new unicodeJS.TextString( test.string );
-               result = [];
-
-               for ( i = 0; i <= textString.getLength(); i++ ) {
+               for ( i = 0; i <= test.string.length; i++ ) {
                        result.push( unicodeJS.wordbreak.isBreak( textString, i 
) );
                }
                assert.deepEqual( result, test.expected, test.msg );
        } );
-
-       textString = new unicodeJS.TextString( '' );
-       textString.codepoints = '𨋢'.split( '' );
-       result = [];
-
-       for ( i = 0; i <= textString.getLength(); i++ ) {
-               result.push( unicodeJS.wordbreak.isBreak( textString, i ) );
-       }
-       assert.deepEqual( result, [ true, false, true ], 'Basic support for 
code unit splitting' );
-
 } );
 
 QUnit.test( 'nextBreakOffset/prevBreakOffset', function ( assert ) {
@@ -98,6 +86,5 @@
        var plainString = 'abc𨋢def',
                textString = new unicodeJS.TextString( plainString );
 
-       assert.equal( textString.getLength(), 7, 'getLength' );
        assert.equal( textString.toString(), plainString, 'toString' );
 } );

-- 
To view, visit https://gerrit.wikimedia.org/r/391820
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If30035a6f63397ed2e583922200299ee228249c9
Gerrit-PatchSet: 1
Gerrit-Project: unicodejs
Gerrit-Branch: master
Gerrit-Owner: Esanders <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] unicodejs[master]: Go back to storing strings as code units

Reply via email to