Divec has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/124287

Change subject: DONT MERGE: Proof of concept linear segmenter
......................................................................

DONT MERGE: Proof of concept linear segmenter

Use the "linearisation" trick borrowed from VE to flatten inline content (cf.
http://www.mediawiki.org/wiki/VisualEditor/Software_design#Data_Structures).
The resulting marked-up "linear text" is much easier to segment validly.

goSegmentation.js:
* Proof-of-concept SAX lineariser

enSegmenter.js:
* Proof-of-concept plaintext segmenter, for SAX lineariser to call

TODO:
* Make references, inline images etc widthless (attach to a zero-length string),
  so language-specific segmenters don't have to worry about "[5]" etc.

* Hook up lineariser to plaintext segmenter

* Output delinearised segmented HTML (tags will balance automatically)

* Allow separate, subordinate linear text within references, image captions etc.

Change-Id: I0b53b87f60494b49ad8b0fb7654dd859f6f97045
---
A enSegmenter.js
A goSegmentation.js
2 files changed, 213 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/87/124287/1

diff --git a/enSegmenter.js b/enSegmenter.js
new file mode 100644
index 0000000..5459d11
--- /dev/null
+++ b/enSegmenter.js
@@ -0,0 +1,71 @@
+/**
+ * Find all matches of regex in text, calling callback with each match object
+ *
+ * regex.exec( text ) is called repeatedly until it returns null, so regex must
+ * carry the global (g) flag; without it exec never advances and the loop would
+ * not terminate. Callback results that are null are skipped; all others are
+ * collected in match order.
+ *
+ * @private
+ * @param {string} text The text to search
+ * @param {RegExp} regex The regex to search; should be created for this function call
+ * @param {Function} callback Function to call with each match, as callback( text, match )
+ * @returns {Array} The non-null return values from the callback
+ */
+function findAll ( text, regex, callback ) {
+       var match, boundary,
+               boundaries = [];
+       while ( true ) {
+               match = regex.exec( text ); // a global regex resumes from regex.lastIndex
+               if ( match === null ) {
+                       break;
+               }
+               boundary = callback( text, match );
+               if ( boundary !== null ) { // null from the callback means "discard this match"
+                       boundaries.push( boundary );
+               }
+       }
+       return boundaries;
+}
+
+/**
+ * Test a possible English sentence boundary match
+ *
+ * Looks at the text that follows the matched character. The candidate is
+ * rejected when it is immediately followed by ',', ';' or ':', or when the
+ * first word character after it is a digit or a lower-case ASCII letter (for
+ * example the '.' in "2.5"). Otherwise the boundary is placed after any
+ * closing quote characters and whitespace that follow the match.
+ *
+ * @param {string} text The plaintext to segment
+ * @param {Object} match The possible boundary match (returned by regex.exec)
+ * @return {number|null} The boundary offset, or null if not a sentence boundary
+ */
+function findBoundaryEn ( text, match ) {
+       var tail = text.slice( match.index + 1, text.length ); // text after the match; assumes a one-character match
+       // Trailing non-final punctuation: not a sentence boundary
+       if ( tail.match( /^[,;:]/ ) ) {
+               return null;
+       }
+       // Next word character is number or lower-case: not a sentence boundary
+       if ( tail.match( /^\W*[0-9a-z]/ ) ) {
+               return null;
+       }
+       // Include any closing punctuation and trailing space
+       return match.index + 1 + tail.match( /^['”"’]*\s*/ )[0].length;
+}
+
+/**
+ * Find English sentence boundaries
+ *
+ * Every '.', '!' or '?' in the text is treated as a candidate and filtered
+ * through findBoundaryEn; the offsets of the accepted candidates are returned.
+ *
+ * @param {string} text The plaintext to segment
+ * @returns {number[]} Sentence boundary offsets
+ */
+function getSegmentBoundariesEn ( text ) {
+       // Regex to find possible English sentence boundaries.
+       // A fresh regex literal is created on every call: exec() keeps its scan
+       // position in lastIndex, so a shared instance must not be reused here.
+       return findAll( text, /[.!?]/g, findBoundaryEn );
+}
+
+( function () { // Demo: segment a sample string and log the boundaries and segments
+       var i, len, sample, boundaries, segment;
+       sample = 'Hello. "This is good." Only 2.5 people care though.';
+       boundaries = getSegmentBoundariesEn( sample );
+       console.log( 'sample:', sample );
+       console.log( 'boundaries:', boundaries );
+
+       // Prepend offset 0 so that each consecutive pair of offsets delimits one segment
+       boundaries.splice( 0, 0, 0 );
+       for ( i = 0, len = boundaries.length - 1; i < len; i++ ) {
+               segment = sample.substring( boundaries[i], boundaries[i + 1] );
+               console.log( 'segment:', JSON.stringify( segment ) ); // JSON.stringify makes trailing spaces visible
+       }
+} () );
diff --git a/goSegmentation.js b/goSegmentation.js
new file mode 100644
index 0000000..ef51584
--- /dev/null
+++ b/goSegmentation.js
@@ -0,0 +1,142 @@
+'use strict';
+
+var SAXParser = require( 'sax' ).SAXParser,
+       util = require( 'util' ),
+       fs = require( 'fs' );
+
+/**
+ * Return the names only from an array of SAX open tags
+ *
+ * Reads the name property of each element, preserving order; the input array
+ * is not modified.
+ *
+ * @private
+ * @param {Object[]} tagArray SAX open tags
+ * @returns {string[]} Tag names
+ */
+function getTagNames ( tagArray ) {
+       var i, len, tagNames = [];
+       for ( i = 0, len = tagArray.length; i < len; i++ ) {
+               tagNames.push( tagArray[i].name );
+       }
+       return tagNames;
+}
+
+/**
+ * Determine whether a tag inside body is inline
+ *
+ * Built once by an immediately-invoked function that turns the list of
+ * non-inline tag names below into a lookup object. A tag name is reported as
+ * inline when it has no truthy entry in that lookup, so unknown names count as
+ * inline. 'body' appears twice in the list; the duplicate is harmless.
+ *
+ * NOTE(review): the lookup is a plain object, so a name that matches an
+ * inherited Object.prototype property (e.g. 'constructor') would also be
+ * reported as non-inline.
+ *
+ * @private
+ * @param {string} bodyTagName The name of the tag (lowercase)
+ * @returns {boolean} Whether the tag is inline
+ */
+var isInline = ( function ( tagArray ) {
+       var i, len,
+               nonInlineTags = {};
+       for ( i = 0, len = tagArray.length; i < len; i++ ) {
+               nonInlineTags[tagArray[i]] = true;
+       }
+       return function ( bodyTagName ) {
+               return !nonInlineTags[bodyTagName];
+       };
+} ( [
+       'body',
+        'div', 'p',
+        // tables
+        'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
+        // lists
+        'ul', 'ol', 'li', 'dl', 'dt', 'dd',
+        // HTML5 heading content
+        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
+        // HTML5 sectioning content
+        'article', 'aside', 'body', 'nav', 'section', 'footer', 'header', 
'figure',
+        'figcaption', 'fieldset', 'details', 'blockquote',
+        // other
+        'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
+        'map', 'object', 'pre', 'progress', 'video'
+] ) );
+
+function CXParser () { // SAXParser subclass; call init() before feeding it input
+       SAXParser.call( this, false, { lowercase: true } ); // presumably strict = false, lowercase names — confirm against sax docs
+}
+util.inherits( CXParser, SAXParser ); // SAX callbacks (onopentag etc.) are defined on the prototype below
+
+/**
+ * Reset the state CXParser uses to linearize Parsoid HTML
+ *
+ * The constructor does not create these properties, so init() has to be called
+ * before the parser is fed any input.
+ *
+ * - blockTagStack: open non-inline tags, outermost first
+ * - inlineTagStack: open inline tags, outermost first
+ * - linearText: text chunks of the inline content being collected
+ * - linearTags: for each entry in linearText, a copy of inlineTagStack at that point
+ * - content: output string fragments, joined by the caller when parsing is done
+ */
+CXParser.prototype.init = function () {
+       this.blockTagStack = [];
+       this.inlineTagStack = [];
+       this.linearText = [];
+       this.linearTags = [];
+       this.content = [];
+};
+
+CXParser.prototype.onopentag = function ( tag ) {
+       if (
+               this.blockTagStack.length >= 2 &&
+               this.blockTagStack[1].name === 'body' &&
+               isInline( tag.name )
+       ) {
+               this.inlineTagStack.push( tag );
+       } else {
+               if ( this.inlineTagStack.length > 0 ) {
+                       throw new Error(
+                               'Block tag <' + tag.name + '> inside inline tag 
<' +
+                               this.inlineTagStack[this.inlineTagStack.length 
- 1].name + '>'
+                       );
+               }
+               if ( this.linearText.length > 0 ) {
+                       this.finishLinearText();
+               }
+               this.blockTagStack.push( tag );
+               // XXX do attributes
+               this.content.push( '<' + tag.name + '>' );
+       }
+};
+
+CXParser.prototype.onclosetag = function ( tagName ) { // SAX close-tag handler
+       if ( this.inlineTagStack.length > 0 ) {
+               this.inlineTagStack.pop(); // closes the innermost inline tag; tagName is not checked against it
+       } else {
+               if ( this.linearText.length > 0 ) {
+                       this.finishLinearText(); // a closing block tag ends the current run of inline content
+               }
+               this.blockTagStack.pop();
+               this.content.push( '</' + tagName + '>\n' );
+       }
+};
+
+CXParser.prototype.ontext = function ( text ) { // SAX text handler
+       if ( !text.trim() && this.linearText.length === 0 ) {
+               this.content.push( text ); // whitespace-only text outside a linear run is copied straight to the output
+       } else {
+               this.linearText.push( text );
+               this.linearTags.push( this.inlineTagStack.slice() ); // snapshot of the inline tags open around this chunk
+       }
+};
+
+CXParser.prototype.finishLinearText = function () {
+       var i, len;
+       this.content.push( '\n[\n' );
+       for ( i = 0, len = this.linearText.length; i < len; i++ ) {
+               this.content.push(
+                       '\t' +
+                       JSON.stringify( {
+                               text: this.linearText[i],
+                               tagStack: getTagNames( this.linearTags[i] )
+                       } ) +
+                       ( i === len ? '\n' : ',\n' )
+               );
+       }
+       this.content.push( ']\n' );
+       this.linearText.length = 0;
+       this.linearTags.length = 0;
+};
+
+var parser = new CXParser();
+parser.init(); // the constructor does not set up the linearizer state
+fs.readFile( 'Food.html', 'utf8', function ( err, data ) { // sample input, path relative to the working directory
+       if ( err ) {
+               return console.log( err );
+       }
+       parser.write( data );
+       process.stdout.write( parser.content.join( '' ) ); // content is a list of string fragments
+} );

-- 
To view, visit https://gerrit.wikimedia.org/r/124287
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0b53b87f60494b49ad8b0fb7654dd859f6f97045
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Divec <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to