Divec has uploaded a new change for review. https://gerrit.wikimedia.org/r/124287
Change subject: DONT MERGE: Proof of concept linear segmenter ...................................................................... DONT MERGE: Proof of concept linear segmenter Use "linearisation" trick borrowed from VE to flatten inline content (c.f. http://www.mediawiki.org/wiki/VisualEditor/Software_design#Data_Structures). The resulting marked-up "linear text" is much easier to segment validly. goSegmentation.js: * Proof-of-concept SAX lineariser enSegmenter.js * Proof-of-concept plaintext segmenter, for SAX lineariser to call TODO: * Make references, inline images etc widthless (attach to a zero-length string), so language-specific segmenters don't have to worry about "[5]" etc. * Hook up lineariser to plaintext segmenter * Output delinearised segmented HTML (tags will balance automatically) * Allow separate, subordinate linear text within references, image captions etc. Change-Id: I0b53b87f60494b49ad8b0fb7654dd859f6f97045 --- A enSegmenter.js A goSegmentation.js 2 files changed, 213 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/87/124287/1 diff --git a/enSegmenter.js b/enSegmenter.js new file mode 100644 index 0000000..5459d11 --- /dev/null +++ b/enSegmenter.js @@ -0,0 +1,71 @@ +/** + * Find all matches of regex in text, calling callback with each match object + * @private + * @param {string} text The text to search + * @param {RegExp} regex The regex to search; should be created for this function call + * @param {Function} callback Function to call with each match + * @returns {Array} The return values from the callback + */ +function findAll ( text, regex, callback ) { + var match, boundary, + boundaries = []; + while ( true ) { + match = regex.exec( text ); + if ( match === null ) { + break; + } + boundary = callback( text, match ); + if ( boundary !== null ) { + boundaries.push( boundary ); + } + } + return boundaries; +} + +/** + * Test a possible English sentence boundary match + * + * 
@param {string} text The plaintext to segment + * @param {Object} match The possible boundary match (returned by regex.exec) + * @return {number|null} The boundary offset, or null if not a sentence boundary + */ +function findBoundaryEn ( text, match ) { + var tail = text.slice( match.index + 1, text.length ); + // Trailing non-final punctuation: not a sentence boundary + if ( tail.match( /^[,;:]/ ) ) { + return null; + } + // Next word character is number or lower-case: not a sentence boundary + if ( tail.match( /^\W*[0-9a-z]/ ) ) { + return null; + } + // Include any closing punctuation and trailing space + return match.index + 1 + tail.match( /^['”"’]*\s*/ )[0].length; +} + +/** + * Find English sentence boundaries + * + * @param {string} text The plaintext to segment + * @returns {number[]} Sentence boundary offsets + */ +function getSegmentBoundariesEn ( text ) { + // Regex to find possible English sentence boundaries. + // Must not use a shared regex instance (re.lastIndex is used) + return findAll( text, /[.!?]/g, findBoundaryEn ); +} + +( function () { + var i, len, sample, boundaries, segment; + sample = 'Hello. "This is good." 
Only 2.5 people care though.'; + boundaries = getSegmentBoundariesEn( sample ); + console.log( 'sample:', sample ); + console.log( 'boundaries:', boundaries ); + + // add in the sample start + boundaries.splice( 0, 0, 0 ); + for ( i = 0, len = boundaries.length - 1; i < len; i++ ) { + segment = sample.substring( boundaries[i], boundaries[i + 1] ); + console.log( 'segment:', JSON.stringify( segment ) ); + } +} () ); diff --git a/goSegmentation.js b/goSegmentation.js new file mode 100644 index 0000000..ef51584 --- /dev/null +++ b/goSegmentation.js @@ -0,0 +1,142 @@ +'use strict'; + +var SAXParser = require( 'sax' ).SAXParser, + util = require( 'util' ), + fs = require( 'fs' ); + +/** + * Return the names only from an array of SAX open tags + * + * @private + * @param {object[]} tagArray SAX open tags + * @returns [string[]] Tag names + */ +function getTagNames ( tagArray ) { + var i, len, tagNames = []; + for ( i = 0, len = tagArray.length; i < len; i++ ) { + tagNames.push( tagArray[i].name ); + } + return tagNames; +} + +/** + * Determine whether a tag inside body is inline + * + * @private + * @param {string} bodyTagName The name of the tag (lowercase) + * @returns {boolean} Whether the tag is inline + */ +var isInline = ( function ( tagArray ) { + var i, len, + nonInlineTags = {}; + for ( i = 0, len = tagArray.length; i < len; i++ ) { + nonInlineTags[tagArray[i]] = true; + } + return function ( bodyTagName ) { + return !nonInlineTags[bodyTagName]; + }; +} ( [ + 'body', + 'div', 'p', + // tables + 'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td', + // lists + 'ul', 'ol', 'li', 'dl', 'dt', 'dd', + // HTML5 heading content + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup', + // HTML5 sectioning content + 'article', 'aside', 'body', 'nav', 'section', 'footer', 'header', 'figure', + 'figcaption', 'fieldset', 'details', 'blockquote', + // other + 'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed', + 'map', 'object', 'pre', 'progress', 'video' +] 
) ); + +function CXParser () { + SAXParser.call( this, false, { lowercase: true } ); +} +util.inherits( CXParser, SAXParser ); + +/** + * Parser: linearize Parsoid HTML + * @class + */ +CXParser.prototype.init = function () { + this.blockTagStack = []; + this.inlineTagStack = []; + this.linearText = []; + this.linearTags = []; + this.content = []; +}; + +CXParser.prototype.onopentag = function ( tag ) { + if ( + this.blockTagStack.length >= 2 && + this.blockTagStack[1].name === 'body' && + isInline( tag.name ) + ) { + this.inlineTagStack.push( tag ); + } else { + if ( this.inlineTagStack.length > 0 ) { + throw new Error( + 'Block tag <' + tag.name + '> inside inline tag <' + + this.inlineTagStack[this.inlineTagStack.length - 1].name + '>' + ); + } + if ( this.linearText.length > 0 ) { + this.finishLinearText(); + } + this.blockTagStack.push( tag ); + // XXX do attributes + this.content.push( '<' + tag.name + '>' ); + } +}; + +CXParser.prototype.onclosetag = function ( tagName ) { + if ( this.inlineTagStack.length > 0 ) { + this.inlineTagStack.pop(); + } else { + if ( this.linearText.length > 0 ) { + this.finishLinearText(); + } + this.blockTagStack.pop(); + this.content.push( '</' + tagName + '>\n' ); + } +}; + +CXParser.prototype.ontext = function ( text ) { + if ( !text.trim() && this.linearText.length === 0 ) { + this.content.push( text ); + } else { + this.linearText.push( text ); + this.linearTags.push( this.inlineTagStack.slice() ); + } +}; + +CXParser.prototype.finishLinearText = function () { + var i, len; + this.content.push( '\n[\n' ); + for ( i = 0, len = this.linearText.length; i < len; i++ ) { + this.content.push( + '\t' + + JSON.stringify( { + text: this.linearText[i], + tagStack: getTagNames( this.linearTags[i] ) + } ) + + ( i === len ? 
'\n' : ',\n' ) + ); + } + this.content.push( ']\n' ); + this.linearText.length = 0; + this.linearTags.length = 0; +}; + +/* Demo driver: read Parsoid HTML from Food.html, linearise it, and print the flattened content. */ +var parser = new CXParser(); +parser.init(); +fs.readFile( 'Food.html', 'utf8', function ( err, data ) { + if ( err ) { + return console.log( err ); + } + parser.write( data ); + process.stdout.write( parser.content.join( '' ) ); +} ); -- To view, visit https://gerrit.wikimedia.org/r/124287 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0b53b87f60494b49ad8b0fb7654dd859f6f97045 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Owner: Divec <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
