Esanders has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/391820 )
Change subject: Go back to storing strings as code units
......................................................................
Go back to storing strings as code units
Instead, implement nextCodepoint/prevCodepoint methods so we can
iterate over the data structure as if it were a sequence of codepoints.
Change-Id: If30035a6f63397ed2e583922200299ee228249c9
---
M src/unicodejs.js
M src/unicodejs.textstring.js
M src/unicodejs.wordbreak.js
M tests/unicodejs.test.js
M tests/unicodejs.wordbreak.test.js
5 files changed, 95 insertions(+), 50 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/unicodejs refs/changes/20/391820/1
diff --git a/src/unicodejs.js b/src/unicodejs.js
index cba801c..404c900 100644
--- a/src/unicodejs.js
+++ b/src/unicodejs.js
@@ -29,6 +29,26 @@
};
/**
+ * Check if a code unit is the leading half of a surrogate pair
+ *
+ * @param {string} unit Code unit
+ * @return {boolean}
+ */
+ unicodeJS.isLeadingSurrogate = function ( unit ) {
+ return unit && unit.match( /^[\uD800-\uDBFF]$/ );
+ };
+
+ /**
+ * Check if a code unit is the trailing half of a surrogate pair
+ *
+ * @param {string} unit Code unit
+ * @return {boolean}
+ */
+ unicodeJS.isTrailingSurrogate = function ( unit ) {
+ return unit && unit.match( /^[\uDC00-\uDFFF]$/ );
+ };
+
+ /**
* Write a UTF-16 code unit as a javascript string literal.
*
* @private
diff --git a/src/unicodejs.textstring.js b/src/unicodejs.textstring.js
index 99246cb..038fca2 100644
--- a/src/unicodejs.textstring.js
+++ b/src/unicodejs.textstring.js
@@ -16,7 +16,7 @@
* @param {string} text Text
*/
unicodeJS.TextString = function UnicodeJSTextString( text ) {
- this.codepoints = unicodeJS.splitCharacters( text );
+ this.text = text;
};
/* Methods */
@@ -29,18 +29,39 @@
* @return {string|null} Unicode codepoint, or null if out of bounds
*/
unicodeJS.TextString.prototype.read = function ( position ) {
- var codepointAt = this.codepoints[ position ];
- return codepointAt !== undefined ? codepointAt : null;
+ var dataAt = this.text[ position ];
+ return dataAt !== undefined ? dataAt : null;
};
-/**
- * Return number of codepoints in the text string
- *
- * @method
- * @return {number} Number of codepoints
- */
-unicodeJS.TextString.prototype.getLength = function () {
- return this.codepoints.length;
+unicodeJS.TextString.prototype.nextCodepoint = function ( position ) {
+ var trailing,
+ codepoint = this.read( position );
+
+ if ( unicodeJS.isLeadingSurrogate( codepoint ) ) {
+ trailing = this.read( position + 1 );
+ if ( unicodeJS.isTrailingSurrogate( trailing ) ) {
+ codepoint += trailing;
+ }
+ }
+ return codepoint;
+};
+
+unicodeJS.TextString.prototype.prevCodepoint = function ( position ) {
+ var leading,
+ codepoint = this.read( position - 1 );
+
+ if ( unicodeJS.isTrailingSurrogate( codepoint ) ) {
+ leading = this.read( position - 2 );
+ if ( unicodeJS.isLeadingSurrogate( leading ) ) {
+ codepoint = leading + codepoint;
+ }
+ }
+ return codepoint;
+};
+
+unicodeJS.TextString.prototype.isMidSurrogate = function ( position ) {
+ return unicodeJS.isLeadingSurrogate( this.read( position - 1 ) ) &&
+ unicodeJS.isTrailingSurrogate( this.read( position ) );
};
/**
@@ -49,5 +70,5 @@
* @return {string} Plain javascript string
*/
unicodeJS.TextString.prototype.toString = function () {
- return this.codepoints.join( '' );
+ return this.text;
};
diff --git a/src/unicodejs.wordbreak.js b/src/unicodejs.wordbreak.js
index ea1e0e3..a3a3c3d 100644
--- a/src/unicodejs.wordbreak.js
+++ b/src/unicodejs.wordbreak.js
@@ -110,14 +110,14 @@
/**
* Evaluates whether a position within some text is a word boundary.
*
- * The text object elements may be codepoints or code units (deprecated)
+ * The text object elements may be codepoints or code units
*
* @param {unicodeJS.TextString} string TextString
* @param {number} pos Character position
* @return {boolean} Is the position a word boundary
*/
wordbreak.isBreak = function ( string, pos ) {
- var nextRgt, nextLft,
+ var nextCodepoint, prevCodepoint, nextProperty, prevProperty,
lft = [],
rgt = [],
l = 0,
@@ -130,18 +130,18 @@
return true;
}
- // Compatibility with TextString objects that split codepoints
// Do not break inside surrogate pair
- if (
- string.read( pos - 1 ).match( /^[\uD800-\uDBFF]$/ ) &&
- string.read( pos ).match( /^[\uDC00-\uDFFF]$/ )
- ) {
+ if ( string.isMidSurrogate( pos ) ) {
return false;
}
- // get some context
- rgt.push( getProperty( string.read( pos + r ) ) );
- lft.push( getProperty( string.read( pos - l - 1 ) ) );
+ // Get some context
+ nextCodepoint = string.nextCodepoint( pos + r );
+ prevCodepoint = string.prevCodepoint( pos - l );
+ rgt.push( getProperty( nextCodepoint ) );
+ lft.push( getProperty( prevCodepoint ) );
+ r += nextCodepoint.length;
+ l += prevCodepoint.length;
switch ( true ) {
// Do not break within CRLF.
@@ -166,12 +166,13 @@
}
// We've reached the end of an Extend|Format sequence, collapse it
while ( lft[ 0 ] === 'Extend' || lft[ 0 ] === 'Format' ) {
- l++;
if ( pos - l - 1 < 0 ) {
// start of document
return true;
}
- lft[ lft.length - 1 ] = getProperty( string.read( pos -
l - 1 ) );
+ prevCodepoint = string.prevCodepoint( pos - l );
+ lft[ 0 ] = getProperty( prevCodepoint );
+ l += prevCodepoint.length;
}
// Do not break between most letters.
@@ -185,15 +186,25 @@
// Some tests beyond this point require more context, as per WB4 ignore Format and Extend.
do {
- r++;
- nextRgt = getProperty( string.read( pos + r ) );
- } while ( nextRgt === 'Extend' || nextRgt === 'Format' );
- rgt.push( nextRgt );
+ nextCodepoint = string.nextCodepoint( pos + r );
+ if ( nextCodepoint === null ) {
+ nextProperty = null;
+ break;
+ }
+ r += nextCodepoint.length;
+ nextProperty = getProperty( nextCodepoint );
+ } while ( nextProperty === 'Extend' || nextProperty ===
'Format' );
+ rgt.push( nextProperty );
do {
- l++;
- nextLft = getProperty( string.read( pos - l - 1 ) );
- } while ( nextLft === 'Extend' || nextLft === 'Format' );
- lft.push( nextLft );
+ prevCodepoint = string.prevCodepoint( pos - l );
+ if ( prevCodepoint === null ) {
+ prevProperty = null;
+ break;
+ }
+ l += prevCodepoint.length;
+ prevProperty = getProperty( prevCodepoint );
+ } while ( prevProperty === 'Extend' || prevProperty ===
'Format' );
+ lft.push( prevProperty );
switch ( true ) {
// Do not break letters across certain punctuation.
diff --git a/tests/unicodejs.test.js b/tests/unicodejs.test.js
index 8cd53d1..472f0c8 100644
--- a/tests/unicodejs.test.js
+++ b/tests/unicodejs.test.js
@@ -18,12 +18,18 @@
data = line.split( ' ' );
data.forEach( function ( str, i ) {
+ var codepoint;
if ( i % 2 === 0 ) {
// Tests at even offsets
expected.push( breakMap[ str ] );
} else {
+ codepoint = +( '0x' + str );
// Chars at odd offsets
- chars += String.fromCodePoint( +( '0x' + str )
);
+ chars += String.fromCodePoint( codepoint );
+ // For surrogate pairs, add an expected no-break between them
+ if ( codepoint > 0xFFFF ) {
+ expected.push( false );
+ }
}
} );
diff --git a/tests/unicodejs.wordbreak.test.js
b/tests/unicodejs.wordbreak.test.js
index 209496d..8c78c15 100644
--- a/tests/unicodejs.wordbreak.test.js
+++ b/tests/unicodejs.wordbreak.test.js
@@ -8,28 +8,16 @@
QUnit.module( 'unicodeJS.wordbreak' );
QUnit.test( 'Unicode test suite', function ( assert ) {
- var i, textString, result;
-
unicodeJS.testdata.wordbreak.reduce( unicodeJS.test.parseTestReduce, []
).forEach( function ( test ) {
+ var i,
+ textString = new unicodeJS.TextString( test.string ),
+ result = [];
- textString = new unicodeJS.TextString( test.string );
- result = [];
-
- for ( i = 0; i <= textString.getLength(); i++ ) {
+ for ( i = 0; i <= test.string.length; i++ ) {
result.push( unicodeJS.wordbreak.isBreak( textString, i
) );
}
assert.deepEqual( result, test.expected, test.msg );
} );
-
- textString = new unicodeJS.TextString( '' );
- textString.codepoints = '𨋢'.split( '' );
- result = [];
-
- for ( i = 0; i <= textString.getLength(); i++ ) {
- result.push( unicodeJS.wordbreak.isBreak( textString, i ) );
- }
- assert.deepEqual( result, [ true, false, true ], 'Basic support for
code unit splitting' );
-
} );
QUnit.test( 'nextBreakOffset/prevBreakOffset', function ( assert ) {
@@ -98,6 +86,5 @@
var plainString = 'abc𨋢def',
textString = new unicodeJS.TextString( plainString );
- assert.equal( textString.getLength(), 7, 'getLength' );
assert.equal( textString.toString(), plainString, 'toString' );
} );
--
To view, visit https://gerrit.wikimedia.org/r/391820
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If30035a6f63397ed2e583922200299ee228249c9
Gerrit-PatchSet: 1
Gerrit-Project: unicodejs
Gerrit-Branch: master
Gerrit-Owner: Esanders <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits