jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391612 )
Change subject: Update UnicodeJS to v0.2.0 ...................................................................... Update UnicodeJS to v0.2.0 Release notes: https://phabricator.wikimedia.org/diffusion/GUJS/browse/master/History.md;v0.2.0 Change-Id: Ica02f42216d8deb968684f43ccc5a493f4c7a368 --- M lib/unicodejs/unicodejs.js 1 file changed, 61 insertions(+), 78 deletions(-) Approvals: Catrope: Looks good to me, approved jenkins-bot: Verified diff --git a/lib/unicodejs/unicodejs.js b/lib/unicodejs/unicodejs.js index ef6a8d5..280839a 100644 --- a/lib/unicodejs/unicodejs.js +++ b/lib/unicodejs/unicodejs.js @@ -1,12 +1,12 @@ /*! - * UnicodeJS v0.1.6 + * UnicodeJS v0.2.0 * https://www.mediawiki.org/wiki/UnicodeJS * - * Copyright 2013-2016 UnicodeJS Team and other contributors. + * Copyright 2013-2017 UnicodeJS Team and other contributors. * Released under the MIT license * http://unicodejs.mit-license.org/ * - * Date: 2016-12-09T23:38:04Z + * Date: 2017-11-15T18:29:34Z */ /*! * UnicodeJS namespace @@ -154,29 +154,28 @@ for ( i = 0; i < ranges.length; i++ ) { range = ranges[ i ]; + // Handle single code unit - if ( typeof range === 'number' && range <= 0xFFFF ) { - if ( range >= 0xD800 && range <= 0xDFFF ) { - throw new Error( 'Surrogate: ' + range.toString( 16 ) ); + if ( typeof range === 'number' ) { + if ( range <= 0xFFFF ) { + if ( range >= 0xD800 && range <= 0xDFFF ) { + throw new Error( 'Surrogate: ' + range.toString( 16 ) ); + } + characterClass.push( uEsc( range ) ); + continue; + } else { + // Handle single surrogate pair + if ( range > 0x10FFFF ) { + throw new Error( 'Character code too high: ' + range.toString( 16 ) ); + } + /* eslint-disable no-bitwise */ + hi = 0xD800 + ( ( range - 0x10000 ) >> 10 ); + lo = 0xDC00 + ( ( range - 0x10000 ) & 0x3FF ); + /* eslint-enable no-bitwise */ + + disjunction.push( uEsc( hi ) + uEsc( lo ) ); + continue; } - if ( range > 0x10FFFF ) { - throw new Error( 'Character code too high: ' + - range.toString( 16 ) ); - } - characterClass.push( uEsc( range ) ); - continue; - } - - // Handle single surrogate pair - if ( typeof range === 'number' && range > 0xFFFF ) { - /* eslint-disable no-bitwise */ - - hi = 0xD800 + ( ( range - 0x10000 ) >> 10 ); - lo = 0xDC00 + ( ( range - 0x10000 ) & 0x3FF ); - /* eslint-enable no-bitwise */ - - disjunction.push( uEsc( hi ) + uEsc( lo ) ); - continue; } // Handle interval @@ -196,11 +195,11 @@ if ( max <= 0xFFFF ) { // interval is entirely BMP characterClass.push( codeUnitRange( min, max ) ); - } else if ( min <= 0xFFFF && max > 0xFFFF ) { + } else if ( min <= 0xFFFF ) { // interval is BMP and non-BMP characterClass.push( codeUnitRange( min, 0xFFFF ) ); boxes = getCodeUnitBoxes( 0x10000, max ); - } else if ( min > 0xFFFF ) { + } else { // interval is entirely non-BMP boxes = getCodeUnitBoxes( min, max ); } @@ -299,44 +298,31 @@ * @param {string} text Text */ unicodeJS.TextString = function UnicodeJSTextString( text ) { - this.clusters = unicodeJS.graphemebreak.splitClusters( text ); + this.codepoints = unicodeJS.splitCharacters( text ); }; /* Methods */ /** - * Read grapheme cluster at specified position + * Read unicode codepoint at specified position * * @method * @param {number} position Position to read from - * @return {string|null} Grapheme cluster, or null if out of bounds + * @return {string|null} Unicode codepoint, or null if out of bounds */ unicodeJS.TextString.prototype.read = function ( position ) { - var clusterAt = this.clusters[ position ]; - return clusterAt !== undefined ? clusterAt : null; + var codepointAt = this.codepoints[ position ]; + return codepointAt !== undefined ? codepointAt : null; }; /** - * Return number of grapheme clusters in the text string + * Return number of codepoints in the text string * * @method - * @return {number} Number of grapheme clusters + * @return {number} Number of codepoints */ unicodeJS.TextString.prototype.getLength = function () { - return this.clusters.length; -}; - -/** - * Return a sub-TextString - * - * @param {number} start Start offset - * @param {number} end End offset - * @return {unicodeJS.TextString} New TextString object containing substring - */ -unicodeJS.TextString.prototype.substring = function ( start, end ) { - var textString = new unicodeJS.TextString( '' ); - textString.clusters = this.clusters.slice( start, end ); - return textString; + return this.codepoints.length; }; /** @@ -344,8 +330,8 @@ * * @return {string} Plain javascript string */ -unicodeJS.TextString.prototype.getString = function () { - return this.clusters.join( '' ); +unicodeJS.TextString.prototype.toString = function () { + return this.codepoints.join( '' ); }; // This file is GENERATED by tools/unicodejs-properties.js @@ -518,34 +504,23 @@ } /** - * Return the wordbreak property value for the cluster - * - * This is a slight con, because Unicode wordbreak property values are defined - * per character, not per cluster, whereas we're already working with a string - * split into clusters. - * - * We are making a working assumption that we can implement the Unicode - * word boundary specification by taking the property value of the *first* - * character of the cluster. In particular, this implements WB4 for us, because - * non-initial Extend or Format characters disappear. + * Return the wordbreak property value for the codepoint * * See http://www.unicode.org/reports/tr29/#Word_Boundaries * * @private - * @param {string} cluster The grapheme cluster - * @return {string|null} The unicode wordbreak property value + * @param {string} codepoint The codepoint + * @return {string|null} The unicode wordbreak property value (key of unicodeJS.wordbreakproperties) */ - function getProperty( cluster ) { - var character, property; - // cluster is always converted to a string by RegExp#test + function getProperty( codepoint ) { + // codepoint is always converted to a string by RegExp#test // e.g. null -> 'null' and would match /[a-z]/ // so return null for any non-string value - if ( typeof cluster !== 'string' ) { + if ( typeof codepoint !== 'string' ) { return null; } - character = unicodeJS.splitCharacters( cluster )[ 0 ]; for ( property in patterns ) { - if ( patterns[ property ].test( character ) ) { + if ( patterns[ property ].test( codepoint ) ) { return property; } } @@ -614,14 +589,15 @@ /** * Evaluates whether a position within some text is a word boundary. * - * The text object elements may be code units, codepoints or clusters. + * The text object elements may be codepoints or code units (deprecated) * - * @param {Object} string TextString-like object with read( pos ) returning string|null + * @param {unicodeJS.TextString} string TextString * @param {number} pos Character position * @return {boolean} Is the position a word boundary */ wordbreak.isBreak = function ( string, pos ) { - var lft = [], + var nextRgt, nextLft, + lft = [], rgt = [], l = 0, r = 0; @@ -633,10 +609,11 @@ return true; } + // Compatibility with TextString objects that split codepoints // Do not break inside surrogate pair if ( - string.read( pos - 1 ).match( /[\uD800-\uDBFF]/ ) && - string.read( pos ).match( /[\uDC00-\uDFFF]/ ) + string.read( pos - 1 ).match( /^[\uD800-\uDBFF]$/ ) && + string.read( pos ).match( /^[\uDC00-\uDFFF]$/ ) ) { return false; } @@ -669,7 +646,7 @@ // We've reached the end of an Extend|Format sequence, collapse it while ( lft[ 0 ] === 'Extend' || lft[ 0 ] === 'Format' ) { l++; - if ( pos - l - 1 <= 0 ) { + if ( pos - l - 1 < 0 ) { // start of document return true; } @@ -685,11 +662,17 @@ return false; } - // some tests beyond this point require more context - l++; - r++; - rgt.push( getProperty( string.read( pos + r ) ) ); - lft.push( getProperty( string.read( pos - l - 1 ) ) ); + // Some tests beyond this point require more context, as per WB4 ignore Format and Extend. + do { + r++; + nextRgt = getProperty( string.read( pos + r ) ); + } while ( nextRgt === 'Extend' || nextRgt === 'Format' ); + rgt.push( nextRgt ); + do { + l++; + nextLft = getProperty( string.read( pos - l - 1 ) ); + } while ( nextLft === 'Extend' || nextLft === 'Format' ); + lft.push( nextLft ); switch ( true ) { // Do not break letters across certain punctuation. -- To view, visit https://gerrit.wikimedia.org/r/391612 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ica02f42216d8deb968684f43ccc5a493f4c7a368 Gerrit-PatchSet: 1 Gerrit-Project: VisualEditor/VisualEditor Gerrit-Branch: master Gerrit-Owner: Jforrester <[email protected]> Gerrit-Reviewer: Catrope <[email protected]> Gerrit-Reviewer: Esanders <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
