Divec has uploaded a new change for review.
https://gerrit.wikimedia.org/r/68864
Change subject: splitClusters uses Grapheme Cluster Boundary rules
......................................................................
splitClusters uses Grapheme Cluster Boundary rules
unicodejs.graphemebreak.js
* New file: singleton class with splitClusters method
* On load, builds graphemeBreakRegexp from unicodejs.graphemebreakproperties.js
unicodejs.js
* Remove old splitClusters method (was just a placeholder)
* Change "conjunction" -> "disjunction", for consistency and correctness
unicodejs.textstring.js
* Use new splitClusters method
modules/ve/ve.js
* Use new splitClusters method
unicodejs.wordbreak.test.js
* Add new splitClusters test
* Refactor isBreak test to use splitClusters
index.php
* add unicodejs.graphemebreak.js, unicodejs.graphemebreakproperties.js
VisualEditor.php
* add unicodejs.graphemebreak.js, unicodejs.graphemebreakproperties.js
demos/ve/index.php
* add unicodejs.graphemebreak.js, unicodejs.graphemebreakproperties.js
maintenance/makeStaticLoader.php
* add unicodejs.graphemebreak.js, unicodejs.graphemebreakproperties.js
modules/ve/test/index.php
* add unicodejs.graphemebreak.js, unicodejs.graphemebreakproperties.js
.docs/categories.json
* add unicodeJS.graphemebreak class
Change-Id: I8f512e2fc2c46eb4b5f00994a8dac88f3c8f7dd2
---
M .docs/categories.json
M VisualEditor.php
M demos/ve/index.php
M maintenance/makeStaticLoader.php
M modules/unicodejs/index.php
A modules/unicodejs/unicodejs.graphemebreak.js
M modules/unicodejs/unicodejs.js
M modules/unicodejs/unicodejs.textstring.js
M modules/unicodejs/unicodejs.wordbreak.test.js
M modules/ve/test/index.php
M modules/ve/ve.js
11 files changed, 164 insertions(+), 65 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/VisualEditor
refs/changes/64/68864/1
diff --git a/.docs/categories.json b/.docs/categories.json
index 2601503..0efd230 100644
--- a/.docs/categories.json
+++ b/.docs/categories.json
@@ -173,7 +173,7 @@
{
"name": "UnicodeJS",
"classes": [
- "unicodeJS", "unicodeJS.TextString",
"unicodeJS.wordbreak"
+ "unicodeJS", "unicodeJS.TextString",
"unicodeJS.wordbreak", "unicodeJS.graphemebreak"
]
}
]
diff --git a/VisualEditor.php b/VisualEditor.php
index b3bcfd6..f7356a8 100644
--- a/VisualEditor.php
+++ b/VisualEditor.php
@@ -85,6 +85,8 @@
'scripts' => array(
'unicodejs/unicodejs.js',
'unicodejs/unicodejs.textstring.js',
+ 'unicodejs/unicodejs.graphemebreakproperties.js',
+ 'unicodejs/unicodejs.graphemebreak.js',
'unicodejs/unicodejs.wordbreakproperties.js',
'unicodejs/unicodejs.wordbreak.js',
),
diff --git a/demos/ve/index.php b/demos/ve/index.php
index a1d3d34..88ab824 100644
--- a/demos/ve/index.php
+++ b/demos/ve/index.php
@@ -85,6 +85,8 @@
<script
src="../../modules/rangy/rangy-position-1.3.js"></script>
<script src="../../modules/unicodejs/unicodejs.js"></script>
<script
src="../../modules/unicodejs/unicodejs.textstring.js"></script>
+ <script
src="../../modules/unicodejs/unicodejs.graphemebreakproperties.js"></script>
+ <script
src="../../modules/unicodejs/unicodejs.graphemebreak.js"></script>
<script
src="../../modules/unicodejs/unicodejs.wordbreakproperties.js"></script>
<script
src="../../modules/unicodejs/unicodejs.wordbreak.js"></script>
<!-- ext.visualEditor.base#standalone-init -->
diff --git a/maintenance/makeStaticLoader.php b/maintenance/makeStaticLoader.php
index 7478d3c..7054855 100644
--- a/maintenance/makeStaticLoader.php
+++ b/maintenance/makeStaticLoader.php
@@ -89,6 +89,8 @@
'rangy/rangy-position-1.3.js',
'unicodejs/unicodejs.js',
'unicodejs/unicodejs.textstring.js',
+
'unicodejs/unicodejs.graphemebreakproperties.js',
+ 'unicodejs/unicodejs.graphemebreak.js',
'unicodejs/unicodejs.wordbreakproperties.js',
'unicodejs/unicodejs.wordbreak.js',
),
diff --git a/modules/unicodejs/index.php b/modules/unicodejs/index.php
index 7fc9a0f..121764e 100644
--- a/modules/unicodejs/index.php
+++ b/modules/unicodejs/index.php
@@ -21,6 +21,8 @@
<script src="../jquery/jquery.js"></script>
<script src="unicodejs.js"></script>
<script src="unicodejs.textstring.js"></script>
+ <script src="unicodejs.graphemebreakproperties.js"></script>
+ <script src="unicodejs.graphemebreak.js"></script>
<script src="unicodejs.wordbreakproperties.js"></script>
<script src="unicodejs.wordbreak.js"></script>
diff --git a/modules/unicodejs/unicodejs.graphemebreak.js
b/modules/unicodejs/unicodejs.graphemebreak.js
new file mode 100644
index 0000000..8b5bbea
--- /dev/null
+++ b/modules/unicodejs/unicodejs.graphemebreak.js
@@ -0,0 +1,79 @@
+/*!
+ * Graphemebreak module
+ *
+ * Implementation of grapheme cluster boundary detection, based on
+ * Unicode UAX #29 Default Grapheme Cluster Boundary Specification; see
+ * http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
+ *
+ * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
+ * @license The MIT License (MIT); see LICENSE.txt
+ */
+( function () {
+ var property, disjunction = [],
+ graphemeBreakRegexp,
+ properties = unicodeJS.graphemebreakproperties,
+ // Single unicode character (either a UTF-16 code unit or a
surrogate pair)
+ oneCharacter =
'[^\\ud800-\\udfff]|[\\ud800-\\udbff][\\udc00-\\udfff]',
+ /**
+ * @class unicodeJS.graphemebreak
+ * @singleton
+ */
+ graphemebreak = unicodeJS.graphemebreak = {},
+ patterns = {};
+
+ // build regexes
+ for ( property in properties ) {
+ patterns[property] = unicodeJS.charRangeArrayRegexp(
properties[property] );
+ }
+
+ // build disjunction for grapheme cluster split
+ disjunction.push( '\\r\\n' );
+ disjunction.push( patterns.Control );
+ disjunction.push(
+ '(?:' + patterns.L + ')*' +
+ '(?:' + patterns.V + ')+' +
+ '(?:' + patterns.T + ')*'
+ );
+ disjunction.push(
+ '(?:' + patterns.L + ')*' +
+ '(?:' + patterns.LV + ')' +
+ '(?:' + patterns.V + ')*' +
+ '(?:' + patterns.T + ')*'
+ );
+ disjunction.push(
+ '(?:' + patterns.L + ')*' +
+ '(?:' + patterns.LVT + ')' +
+ '(?:' + patterns.T + ')*'
+ );
+ disjunction.push( '(?:' + patterns.L + ')+' );
+ disjunction.push( '(?:' + patterns.T + ')+' );
+ // jshint camelcase: false
+ disjunction.push( '(?:' + patterns.Regional_Indicator + ')+' );
+ // jshint camelcase: true
+ disjunction.push(
+ // TODO: this will break if the extended thing is not
oneCharacter
+ // e.g. hangul jamo L+V+T. Does it matter?
+ '(?:' + oneCharacter + ')' +
+ '(?:' + patterns.Extend + '|' + patterns.SpacingMark + ')+'
+ );
+ disjunction.push( oneCharacter ); // any other
+ graphemeBreakRegexp = new RegExp( '(' + disjunction.join( '|' ) + ')' );
+
+ /**
+ * Split a string into grapheme clusters.
+ *
+ * @param {string} text Text to split
+ * @returns {string[]} Array of clusters
+ */
+ graphemebreak.splitClusters = function ( text ) {
+ var i, parts, length, clusters = [];
+ parts = text.split( graphemeBreakRegexp );
+ length = parts.length;
+ for ( i = 0; i < length; i++ ) {
+ if ( parts[i] !== '' ) {
+ clusters.push( parts[i] );
+ }
+ }
+ return clusters;
+ };
+}() );
diff --git a/modules/unicodejs/unicodejs.js b/modules/unicodejs/unicodejs.js
index 1515389..e513ffb 100644
--- a/modules/unicodejs/unicodejs.js
+++ b/modules/unicodejs/unicodejs.js
@@ -16,19 +16,9 @@
unicodeJS = {};
/**
- * Split a string into grapheme clusters.
- *
- * @param {string} text Text to split
- * @returns {string[]} Array of clusters
- */
- unicodeJS.splitClusters = function ( text ) {
- return text.split( /(?![\uDC00-\uDFFF\u0300-\u036F])/g );
- // kludge: for now, just don't split UTF surrogate pairs or
combining accents
- // TODO: implement Grapheme boundary rules
- };
-
- /**
* Split a string into Unicode characters, keeping surrogates paired.
+ *
+ * You probably want to call unicodeJS.graphemebreak.splitClusters
instead.
*
* @param {string} text Text to split
* @returns {string[]} Array of characters
@@ -78,7 +68,7 @@
*
* Suppose ch1 and ch2 have surrogate pairs (hi1, lo1) and (hi2, lo2).
* Then the range of chars from ch1 to ch2 can be represented as the
- * conjunction of three code unit ranges:
+ * disjunction of three code unit ranges:
*
* [hi1 - hi1][lo1 - 0xDFFF]
* |
@@ -148,7 +138,7 @@
unicodeJS.charRangeArrayRegexp = function( ranges ) {
var i, j, min, max, hi, lo, range, box, boxes,
characterClass = [], // list of (\uXXXX code unit or
interval), for BMP
- conjunction = []; // list of regex strings, to be
joined with '|'
+ disjunction = []; // list of regex strings, to be
joined with '|'
for ( i = 0; i < ranges.length; i++ ) {
range = ranges[i];
@@ -171,7 +161,7 @@
hi = 0xD800 + ( ( range - 0x10000 ) >> 10 );
lo = 0xDC00 + ( ( range - 0x10000 ) & 0x3FF );
/* jslint bitwise: false */
- conjunction.push( uEsc( hi ) + uEsc( lo ) );
+ disjunction.push( uEsc( hi ) + uEsc( lo ) );
continue;
}
@@ -208,17 +198,17 @@
box = boxes[j];
hi = codeUnitRange( box.hi[0], box.hi[1], true
);
lo = codeUnitRange( box.lo[0], box.lo[1], true
);
- conjunction.push( hi + lo );
+ disjunction.push( hi + lo );
}
}
- // prepend BMP character class to the conjunction
+ // prepend BMP character class to the disjunction
if ( characterClass.length === 1 && !
characterClass[0].match(/-/) ) {
- conjunction.unshift( characterClass[0] ); // single
character
+ disjunction.unshift( characterClass[0] ); // single
character
} else if ( characterClass.length > 0 ) {
- conjunction.unshift( '[' + characterClass.join( '' ) +
']' );
+ disjunction.unshift( '[' + characterClass.join( '' ) +
']' );
}
- return conjunction.join( '|' );
+ return disjunction.join( '|' );
};
// Expose
diff --git a/modules/unicodejs/unicodejs.textstring.js
b/modules/unicodejs/unicodejs.textstring.js
index f9bf2aa..93cdfa8 100644
--- a/modules/unicodejs/unicodejs.textstring.js
+++ b/modules/unicodejs/unicodejs.textstring.js
@@ -16,7 +16,7 @@
* @param {string} text Text
*/
unicodeJS.TextString = function UnicodeJSTextString( text ) {
- this.clusters = unicodeJS.splitClusters( text );
+ this.clusters = unicodeJS.graphemebreak.splitClusters( text );
};
/* Methods */
diff --git a/modules/unicodejs/unicodejs.wordbreak.test.js
b/modules/unicodejs/unicodejs.wordbreak.test.js
index 40b2029..091bc79 100644
--- a/modules/unicodejs/unicodejs.wordbreak.test.js
+++ b/modules/unicodejs/unicodejs.wordbreak.test.js
@@ -7,6 +7,33 @@
QUnit.module( 'unicodeJS.wordbreak' );
+QUnit.test( 'splitClusters', function ( assert ) {
+ var expected = [
+ 'a',
+ ' ',
+ ' ',
+ 'b',
+ 'カ',
+ 'タ',
+ 'カ',
+ 'ナ',
+ 'c\u0300\u0327', // c with two combining chars
+ '\ud800\udf08', // U+10308 OLD ITALIC LETTER THE
+ '\ud800\udf08\u0302', // U+10308 + combining circumflex
+ '\r\n',
+ '\n',
+ '\u1104\u1173', // jamo L+V
+ '\u1105\u1161\u11a8', // jamo L+V+T
+ '\ud83c\udded\ud83c\uddf0' // 2*regional indicator characters
+ ];
+ QUnit.expect( 1 );
+ assert.deepEqual( // assert.equal fails
+ unicodeJS.graphemebreak.splitClusters( expected.join( '' ) ),
+ expected,
+ 'Split clusters'
+ );
+});
+
QUnit.test( 'charRangeArrayRegexp', function ( assert ) {
var i, test, doTestFunc, equalityTests, throwTests;
@@ -128,59 +155,52 @@
});
QUnit.test( 'isBreak', function ( assert ) {
- var i, result, context,
- text =
- /*jshint quotmark:double */
- // 0 - 9
- "\u0300xyz'd a' " +
- // 10 - 19
- "'a a-b 1a\r" +
- // 20 - 29
- "\nカタカナ3,1.2" +
- // 30 - 39
- " a_b_3_ナ_ " +
- // 40 - 49
- "汉字/漢字 c\u0300\u0327k " +
- // 50 - 59
- "\ud800\udf08" + // U+10308 OLD ITALIC LETTER THE
- "\ud800\udf08\u0302" + // U+10308 OLD ITALIC LETTER THE
+ combining circumflex
- "\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
- " pad " +
- "\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
- "\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
- // 60 - 69
- " 뜨락또르 트랙터 " + // hangul (not decomposed into jamo)
- //// TODO: test the equivalent text in jamo when
graphemebreak rules work
- ////
"\u1104\u1173\u1105\u1161\u11a8\u1104\u1169\u1105\u1173 " +
- //// "\u1110\u1173\u1105\u1162\u11a8\u1110\u1165" +
- // 70 - 75: "a." tests end of para
- " c\u0300\u0327 a.",
- /*jshint quotmark:single */
- textString = new unicodeJS.TextString( text ),
- breaks = [
- 0, 1, 6, 7, 8, 9, 10,
- 11, 12, 13, 14, 15, 16, 17, 19,
- 21, 25, 30,
- 31, 39, 40,
- 41, 42, 43, 44, 45, 46, 48, 49, 50,
- 53, 54, 57, 58, 60,
- 61, 65, 66, 69, 70,
- 71, 72, 73, 74, 75
+ var i, pos, result, context, breakOffsets, textString,
+ broken = [
+ '\u0300', 'xyz\'d', ' ', 'a', '\'', ' ',
+ '\'', 'a', ' ', 'a', '-', 'b', ' ', '1a', '\r\n',
+ 'カタカナ', '3,1.2', ' ',
+ 'a_b_3_ナ_', ' ',
+ '汉', '字', '/', '漢', '字', ' ',
+ 'c\u0300\u0327k', ' ',
+ // Test ALetter characters above U+FFFF.
+ // ALetter+ should be a single word
+ // (ALetter Extend*)+ should be a single word
+ //
+ // We'll use:
+ // U+10308 OLD ITALIC LETTER THE \ud800\udf08
+ // U+1030A OLD ITALIC LETTER KA \ud800\udf0a
+ // U+0302 COMBINING CIRCUMFLEX \u0302
+ '\ud800\udf08' + '\ud800\udf08\u0302' + '\ud800\udf0a',
+ ' ',
+ '\ud800\udf0a' + '\ud800\udf0a',
+ ' ', '뜨락또르', ' ', '트랙터', ' ', // hangul (composed)
+ //// TODO: test the equivalent hangul decomposed into
jamo
+ ////
'\u1104\u1173\u1105\u1161\u11a8\u1104\u1169\u1105\u1173 ' +
+ //// '\u1110\u1173\u1105\u1162\u11a8\u1110\u1165' +
+ ' ', 'c\u0300\u0327', ' ', 'a', '.'
];
+ breakOffsets = [0];
+ pos = 0;
+ for ( i = 0; i < broken.length; i++ ) {
+ pos += unicodeJS.graphemebreak.splitClusters( broken[i]
).length;
+ breakOffsets.push( pos );
+ }
+ textString = new unicodeJS.TextString( broken.join( '' ) ),
QUnit.expect( textString.getLength() + 1 );
for ( i = 0; i <= textString.getLength(); i++ ) {
- result = ( breaks.indexOf( i ) !== -1 );
+ result = ( breakOffsets.indexOf( i ) !== -1 );
context =
textString.substring( Math.max( i - 4, 0 ), i
).getString() +
'│' +
- textString.substring( i, Math.min( i + 4, text.length )
).getString()
+ textString.substring( i, Math.min( i + 4,
textString.getLength() ) ).getString()
;
assert.equal(
unicodeJS.wordbreak.isBreak( textString, i ),
result,
- 'Break at position ' + i + ': ' + context
+ 'Break at position ' + i + ' (expect ' + result + '): '
+ context
);
}
});
diff --git a/modules/ve/test/index.php b/modules/ve/test/index.php
index ac64440..ff06cec 100644
--- a/modules/ve/test/index.php
+++ b/modules/ve/test/index.php
@@ -38,6 +38,8 @@
<script src="../../rangy/rangy-position-1.3.js"></script>
<script src="../../unicodejs/unicodejs.js"></script>
<script src="../../unicodejs/unicodejs.textstring.js"></script>
+ <script
src="../../unicodejs/unicodejs.graphemebreakproperties.js"></script>
+ <script
src="../../unicodejs/unicodejs.graphemebreak.js"></script>
<script
src="../../unicodejs/unicodejs.wordbreakproperties.js"></script>
<script src="../../unicodejs/unicodejs.wordbreak.js"></script>
<!-- ext.visualEditor.base#standalone-init -->
diff --git a/modules/ve/ve.js b/modules/ve/ve.js
index d86ecd2..c033167 100644
--- a/modules/ve/ve.js
+++ b/modules/ve/ve.js
@@ -527,9 +527,9 @@
};
/**
- * @see unicodeJS#splitClusters
+ * @see unicodeJS.graphemebreak#splitClusters
*/
- ve.splitClusters = unicodeJS.splitClusters;
+ ve.splitClusters = unicodeJS.graphemebreak.splitClusters;
/**
* Determine if the text consists of only unattached combining marks
--
To view, visit https://gerrit.wikimedia.org/r/68864
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I8f512e2fc2c46eb4b5f00994a8dac88f3c8f7dd2
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/VisualEditor
Gerrit-Branch: master
Gerrit-Owner: Divec <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits