Santhosh has uploaded a new change for review. https://gerrit.wikimedia.org/r/175398
Change subject: MT: Introduce MTClient and all other MT clients inherit it ...................................................................... MT: Introduce MTClient and all other MT clients inherit it This allows us to provide generic annotation mapping libraries from parent MTClient class. Apertium and Yandex code organized as classes Change-Id: I5880060274856ad499d98089fbd65b732c284d37 --- M ContentTranslationService.js M mt/Apertium.js A mt/Apertium.languagenames.json M mt/Yandex.js D mt/mappings.js M tests/mt/Apertium.test.js 6 files changed, 226 insertions(+), 382 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/98/175398/1 diff --git a/ContentTranslationService.js b/ContentTranslationService.js index 142a1de..095e9f9 100644 --- a/ContentTranslationService.js +++ b/ContentTranslationService.js @@ -117,7 +117,7 @@ return; } - mtClient = mtClients[ provider ]; + mtClient = new mtClients[ provider ](); sourceHtmlChunks = [ '<div>' ]; reqLength = 0; diff --git a/mt/Apertium.js b/mt/Apertium.js index 0c1f9bc..1ca46b2 100644 --- a/mt/Apertium.js +++ b/mt/Apertium.js @@ -1,309 +1,16 @@ -var apertiumLangMapping, - Q = require( 'q' ), +var Q = require( 'q' ), + util = require( 'util' ), request = require( 'request' ), conf = require( __dirname + '/../utils/Conf.js' ), LinearDoc = require( '../lineardoc/LinearDoc' ), - //logger = require( '../utils/Logger.js' ), - // TODO: Tokenize properly. These work for English/Spanish/Catalan - TOKENS = /[\wáàçéèíïóòúüñÁÀÇÉÈÍÏÓÒÚÜÑ]+(?:[·'][\wáàçéèíïóòúüñÁÀÇÉÈÍÏÓÒÚÜÑ]+)?|[^\wáàçéèíïóòúüñÁÀÇÉÈÍÏÓÒÚÜÑ]+/g, - IS_WORD = /^[\wáàçéèíïóòúüñÁÀÇÉÈÍÏÓÒÚÜÑ]+(?:[·'][\wáàçéèíïóòúüñÁÀÇÉÈÍÏÓÒÚÜÑ]+)?$/; + MTClient = require( './MTClient.js' ), + apertiumLangMapping = require( './Apertium.languagenames.json' ); -apertiumLangMapping = require( __dirname + '/mappings.js' ); +function Apertium() { -/** - * Split text into tokens - * @param {string} lang Language code - * @param {string} text Text to split - * @return {Object[]} List of tokens - * @return[].text Text of the token - * @return[].isWord Whether the token is a word - */ -function getTokens( lang, text ) { - // TODO: implement for other languages than English/Spanish/Catalan - return text.match( TOKENS ).map( function ( tokenText ) { - return { - text: tokenText, - isWord: !!tokenText.match( IS_WORD ) - }; - } ); } -/** - * Language-aware uppercasing - * @param {string} lang Language code - * @param {string} text Text to uppercase - * @return {string} Upper-cased text (possibly identical) - */ -function toUpperCase( lang, text ) { - // stub: just use the javascript ASCII method for now - return text.toUpperCase(); -} - -/** - * Create variants of the text, with a different annotation uppercased in each. - * @param {string} lang Language code - * @param {string} text Text - * @param {Object[]} annotationOffsets start and length of each annotation - * @return {Object[]} - * @return[].start {number} Start offset of uppercasing - * @return[].length {number} Length of uppercasing - * @return[].text {string} Text variant with uppercasing - */ -function getCaseVariants( lang, sourceText, annotationOffsets ) { - var i, len, offset, chunk, upperChunk, variantText, - caseVariants = []; - - for ( i = 0, len = annotationOffsets.length; i < len; i++ ) { - offset = annotationOffsets[ i ]; - chunk = sourceText.slice( offset.start, offset.start + offset.length ); - upperChunk = toUpperCase( lang, chunk ); - if ( upperChunk === chunk ) { - // Already uppercased; can't detect change - continue; - } - variantText = [ - sourceText.slice( 0, offset.start ), - upperChunk, - sourceText.slice( offset.start + offset.length ) - ].join( '' ); - caseVariants.push( { - start: offset.start, - length: offset.length, - text: variantText - } ); - } - return caseVariants; -} - -/** - * Finds offsets of ranges at which tokens have changed to uppercase - * @param {string} text Original text - * @param {string} text Changed text - * @return {Object[]} start and length for each changed range - */ -function getChangedCaseRanges( lang, originalText, changedText ) { - var orig, upper, changed, len, ranges, start, startChar, end, endChar; - orig = getTokens( lang, originalText ); - upper = getTokens( lang, toUpperCase( lang, originalText ) ); - changed = getTokens( lang, changedText ); - - len = orig.length; - if ( len !== upper.length || len !== changed.length ) { - throw new Error( 'token length mismatch' ); - } - - // Find start/end of changed text token ranges. Track char ranges too, and store these. - ranges = []; - // start token - start = 0; - // start char - startChar = 0; - - while ( true ) { - // Skip to first changed word token - while ( start < len && ( - !( orig[ start ].isWord ) || - ( - orig[ start ].text === changed[ start ].text || - upper[ start ].text !== changed[ start ].text - ) - ) ) { - startChar += orig[ start ].text.length; - start++; - } - if ( start >= len ) { - break; - } - // Find last consecutive changed non-word token - end = start; - endChar = startChar + orig[ end ].text.length; - - while ( end < len && ( - !( orig[ end ].isWord ) || - ( - orig[ end ].text !== upper[ end ].text && - upper[ end ].text === changed[ end ].text - ) - ) ) { - end++; - if ( end < len ) { - endChar += orig[ end ].text.length; - } - } - do { - if ( end < len ) { - endChar -= orig[ end ].text.length; - } - end--; - } while ( !( orig[ end ].isWord ) ); - // Store ranges - ranges.push( { - start: startChar, - length: endChar - startChar - } ); - start = end + 1; - startChar = endChar; - } - return ranges; -} - -/** - * Calculate range mappings based on the target text variants - * @param {string} targetLang The target language - * @param {Object[]} sourceVariants The start and length of each variation - * @param { - * @param {Object} annotationOffsets The start and length of each offset, by sourceVariantId - */ -function getRangeMappings( targetLang, sourceVariants, targetText, targetLines ) { - var i, iLen, j, jLen, changedCaseRanges, sourceRange, - rangeMappings = []; - if ( sourceVariants.length !== targetLines.length ) { - throw new Error( 'Translation variants length mismatch' ); - } - for ( i = 0, iLen = sourceVariants.length; i < iLen; i++ ) { - sourceRange = { - start: sourceVariants[ i ].start, - length: sourceVariants[ i ].length - }; - changedCaseRanges = getChangedCaseRanges( - targetLang, - targetText, - targetLines[ i ] - ); - for ( j = 0, jLen = changedCaseRanges.length; j < jLen; j++ ) { - rangeMappings.push( { - source: sourceRange, - target: changedCaseRanges[ j ] - } ); - } - } - return rangeMappings; -} - -/** - * Translate plain text with Apertium API - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string} sourceText Source language text - * @return {Object} Deferred promise: Target language text - */ -function translateTextApertium( sourceLang, targetLang, sourceText ) { - var deferred = Q.defer(), - postData; - - postData = { - url: conf( 'mt.apertium.api' ) + '/translate', - form: { - markUnknown: 0, - langpair: apertiumLangMapping[ sourceLang ] + '|' + apertiumLangMapping[ targetLang ], - format: 'txt', - q: sourceText - } - }; - request.post( postData, - function ( error, response, body ) { - var message; - if ( error ) { - deferred.reject( new Error( error ) ); - return; - } - if ( response.statusCode !== 200 ) { - message = 'Error ' + response.statusCode; - message += ' sourceText={' + sourceText + '}, body={' + body + '}'; - deferred.reject( new Error( message ) ); - return; - } - deferred.resolve( JSON.parse( body ).responseData.translatedText ); - } - ); - return deferred.promise; -} - -/** - * Translate multiple lines of plaintext with apertium - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string[]} sourceLines Source plaintext lines - * @return {Object} Deferred promise: Translated plaintext lines - */ -function translateLinesApertium( sourceLang, targetLang, sourceLines ) { - var sourceLinesText, - deferred = Q.defer(); - // Join lines into single string. Separator must break sentences and pass through unchanged - sourceLinesText = sourceLines.join( '\n.CxServerApertium.\n' ); - translateTextApertium( - sourceLang, - targetLang, - sourceLinesText - ).then( function ( targetLinesText ) { - var targetText = targetLinesText - .replace( /^\s+|\s+$/g, '' ) - .split( /\n\.CxServerApertium\.\n/g ); - deferred.resolve( targetText ); - }, function ( error ) { - deferred.reject( error ); - } ); - return deferred.promise; -} - -/** - * Translate text, using case variants to map tag offsets - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string} sourceText Source plain text - * @param {Object[]} tagOffsets start and length for each annotation chunk - * @return {Object} Deferred promise: Translated plain text and range mappings - */ -function translateTextWithTagOffsets( sourceLang, targetLang, sourceText, tagOffsets ) { - var sourceVariants, sourceLines, m, preSpace, postSpace, trimmedSourceLines, deferred; - sourceVariants = getCaseVariants( sourceLang, sourceText, tagOffsets ); - sourceLines = sourceVariants.map( function ( variant ) { - return variant.text; - } ); - sourceLines.splice( 0, 0, sourceText ); - - // Don't push leading and trailing whitespace through Apertium - m = sourceText.match( /^(\s*).*?(\s*)$/ ); - preSpace = m[ 1 ]; - postSpace = m[ 2 ]; - trimmedSourceLines = sourceLines.map( function ( line ) { - return line.substring( preSpace.length, line.length - postSpace.length ); - } ); - - deferred = Q.defer(); - // Call apertium through module.exports, so tests can override it - // Join segments with a string that will definitely break sentences and be preserved - module.exports.translateLinesApertium( - sourceLang, - targetLang, - trimmedSourceLines - ).then( function ( trimmedTargetLines ) { - var targetLines, targetText, rangeMappings; - targetLines = trimmedTargetLines.map( function ( trimmedTargetLine ) { - return preSpace + trimmedTargetLine + postSpace; - } ); - try { - targetText = targetLines.splice( 0, 1 )[ 0 ]; - rangeMappings = getRangeMappings( - targetLang, - sourceVariants, - targetText, - targetLines - ); - } catch ( ex ) { - deferred.reject( ex ); - return; - } - deferred.resolve( { - text: targetText, - rangeMappings: rangeMappings - } ); - }, function ( error ) { - deferred.reject( error ); - } ); - return deferred.promise; -} - +util.inherits( Apertium, MTClient ); /** * Translate marked-up text * @param {string} sourceLang Source language code @@ -311,23 +18,24 @@ * @param {string} sourceText Source html * @return {Object} Deferred promise: Translated html */ -function translate( sourceLang, targetLang, sourceHtml ) { +Apertium.prototype.translate = function ( sourceLang, targetLang, sourceHtml ) { var i, len, sourceDoc, targetDoc, itemPromises, deferred, + apertium = this, parser = new LinearDoc.Parser(); + parser.init(); parser.write( sourceHtml ); sourceDoc = parser.builder.doc; // Clone and adapt sourceDoc targetDoc = new LinearDoc.Doc( sourceDoc.wrapperTag ); itemPromises = []; - function translateItemDeferred( deferred, item ) { itemPromises.push( deferred.promise ); if ( item.type !== 'textblock' ) { deferred.resolve( item ); return; } - translateTextWithTagOffsets( + apertium.translateTextWithTagOffsets( sourceLang, targetLang, item.item.getPlainText(), @@ -359,11 +67,139 @@ }, function ( error ) { deferred.reject( error ); } ); - return deferred.promise; -} -module.exports = { - translate: translate, - translateLinesApertium: translateLinesApertium, - getTokens: getTokens + return deferred.promise; }; + +/** + * Translate text, using case variants to map tag offsets + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string} sourceText Source plain text + * @param {Object[]} tagOffsets start and length for each annotation chunk + * @return {Object} Deferred promise: Translated plain text and range mappings + */ +Apertium.prototype.translateTextWithTagOffsets = function ( sourceLang, targetLang, sourceText, tagOffsets ) { + var sourceVariants, sourceLines, m, preSpace, postSpace, trimmedSourceLines, deferred, + self = this; + + sourceVariants = this.getCaseVariants( sourceLang, sourceText, tagOffsets ); + sourceLines = sourceVariants.map( function ( variant ) { + return variant.text; + } ); + sourceLines.splice( 0, 0, sourceText ); + + // Don't push leading and trailing whitespace through Apertium + m = sourceText.match( /^(\s*).*?(\s*)$/ ); + preSpace = m[ 1 ]; + postSpace = m[ 2 ]; + trimmedSourceLines = sourceLines.map( function ( line ) { + return line.substring( preSpace.length, line.length - postSpace.length ); + } ); + + deferred = Q.defer(); + // Join segments with a string that will definitely break sentences and be preserved + self.translateLines( + sourceLang, + targetLang, + trimmedSourceLines + ).then( function ( trimmedTargetLines ) { + var targetLines, targetText, rangeMappings; + + targetLines = trimmedTargetLines.map( function ( trimmedTargetLine ) { + return preSpace + trimmedTargetLine + postSpace; + } ); + + try { + targetText = targetLines.splice( 0, 1 )[ 0 ]; + rangeMappings = self.getRangeMappings( + targetLang, + sourceVariants, + targetText, + targetLines + ); + } catch ( ex ) { + deferred.reject( ex ); + return; + } + deferred.resolve( { + text: targetText, + rangeMappings: rangeMappings + } ); + }, function ( error ) { + deferred.reject( error ); + } ); + + return deferred.promise; +}; + +/** + * Translate multiple lines of plaintext with apertium + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string[]} sourceLines Source plaintext lines + * @return {Object} Deferred promise: Translated plaintext lines + */ +Apertium.prototype.translateLines = function ( sourceLang, targetLang, sourceLines ) { + var sourceLinesText, + deferred = Q.defer(); + + // Join lines into single string. Separator must break sentences and pass through unchanged + sourceLinesText = sourceLines.join( '\n.CxServerApertium.\n' ); + + this.translateText( + sourceLang, + targetLang, + sourceLinesText + ).then( function ( targetLinesText ) { + var targetText = targetLinesText + .replace( /^\s+|\s+$/g, '' ) + .split( /\n\.CxServerApertium\.\n/g ); + deferred.resolve( targetText ); + }, function ( error ) { + deferred.reject( error ); + } ); + return deferred.promise; +}; + +/** + * Translate plain text with Apertium API + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string} sourceText Source language text + * @return {Object} Deferred promise: Target language text + */ +Apertium.prototype.translateText = function ( sourceLang, targetLang, sourceText ) { + var deferred = Q.defer(), + postData; + + postData = { + url: conf( 'mt.apertium.api' ) + '/translate', + form: { + markUnknown: 0, + langpair: apertiumLangMapping[ sourceLang ] + '|' + apertiumLangMapping[ targetLang ], + format: 'txt', + q: sourceText + } + }; + request.post( postData, + function ( error, response, body ) { + var message; + + if ( error ) { + deferred.reject( new Error( error ) ); + return; + } + if ( response.statusCode !== 200 ) { + message = 'Error ' + response.statusCode; + message += ' sourceText={' + sourceText + '}, body={' + body + '}'; + deferred.reject( new Error( message ) ); + return; + } + deferred.resolve( JSON.parse( body ).responseData.translatedText ); + } + ); + return deferred.promise; +}; + +module.exports = Apertium; diff --git a/mt/Apertium.languagenames.json b/mt/Apertium.languagenames.json new file mode 100644 index 0000000..945d63d --- /dev/null +++ b/mt/Apertium.languagenames.json @@ -0,0 +1,38 @@ +{ + "af": "afr", + "an": "arg", + "ar": "ara", + "bg": "bul", + "br": "bre", + "bs": "hbs_BS", + "ca": "cat", + "cr": "hbs_HR", + "cy": "cym", + "da": "dan", + "en": "eng", + "eo": "epo", + "es": "spa", + "eu": "eus", + "fr": "fra", + "gl": "glg", + "hr": "hbs_HR", + "id": "ind", + "is": "isl", + "it": "ita", + "kk": "kaz", + "la": "lat", + "mk": "mkd", + "ms": "msa", + "mt": "mlt", + "nb": "nob", + "nl": "nld", + "nn": "nno", + "oc": "oci", + "pt": "por", + "ro": "ron", + "sh": "hbs", + "sl": "slv", + "sr": "hbs_SR", + "sv": "swe", + "tt": "tat" +} \ No newline at end of file diff --git a/mt/Yandex.js b/mt/Yandex.js index 0e24008..1f40d88 100644 --- a/mt/Yandex.js +++ b/mt/Yandex.js @@ -1,31 +1,14 @@ -var errormap, - Q = require( 'q' ), +var Q = require( 'q' ), request = require( 'request' ), + util = require( 'util' ), + MTClient = require( './MTClient.js' ), conf = require( __dirname + '/../utils/Conf.js' ); -// http://api.yandex.com/translate/doc/dg/reference/translate.xml -errormap = { - 200: 'ERR_OK', - 401: 'ERR_KEY_INVALID', - 402: 'ERR_KEY_BLOCKED', - 403: 'ERR_DAILY_REQ_LIMIT_EXCEEDED', - 404: 'ERR_DAILY_CHAR_LIMIT_EXCEEDED', - 413: 'ERR_TEXT_TOO_LONG', - 422: 'ERR_UNPROCESSABLE_TEXT', - 501: 'ERR_LANG_NOT_SUPPORTED' -}; +function Yandex() { -/** - * Returns error name from error code. - * @return {string} - */ -function getErrorName( code ) { - if ( code in errormap ) { - return errormap[code]; - } - - return 'Unknown error'; } + +util.inherits( Yandex, MTClient ); /** * Translate plain text with Yandex. @@ -35,8 +18,9 @@ * @param {string} sourceText Source language text * @return {Q.Promise} Target language text */ -function translate( sourceLang, targetLang, sourceText ) { +Yandex.prototype.translate = function ( sourceLang, targetLang, sourceText ) { var key, postData, + self = this, deferred = Q.defer(); key = conf( 'mt.yandex.key' ); @@ -78,15 +62,37 @@ console.log( ret ); if ( ret.code !== 200 ) { - deferred.reject( new Error( ret.code + ': ' + getErrorName( ret.code ) ) ); + deferred.reject( new Error( ret.code + ': ' + self.getErrorName( ret.code ) ) ); } - deferred.resolve( ret.text[0] ); + deferred.resolve( ret.text[ 0 ] ); } ); return deferred.promise; -} - -module.exports = { - translate: translate }; + +/** + * Returns error name from error code. + * @return {string} + */ +Yandex.prototype.getErrorName = function ( code ) { + // http://api.yandex.com/translate/doc/dg/reference/translate.xml + var errormap = { + 200: 'ERR_OK', + 401: 'ERR_KEY_INVALID', + 402: 'ERR_KEY_BLOCKED', + 403: 'ERR_DAILY_REQ_LIMIT_EXCEEDED', + 404: 'ERR_DAILY_CHAR_LIMIT_EXCEEDED', + 413: 'ERR_TEXT_TOO_LONG', + 422: 'ERR_UNPROCESSABLE_TEXT', + 501: 'ERR_LANG_NOT_SUPPORTED' + }; + + if ( code in errormap ) { + return errormap[ code ]; + } + + return 'Unknown error'; +}; + +module.exports = Yandex; diff --git a/mt/mappings.js b/mt/mappings.js deleted file mode 100644 index 330eb55..0000000 --- a/mt/mappings.js +++ /dev/null @@ -1,38 +0,0 @@ -module.exports = { - af: 'afr', - an: 'arg', - ar: 'ara', - bg: 'bul', - br: 'bre', - bs: 'hbs_BS', - ca: 'cat', - cr: 'hbs_HR', - cy: 'cym', - da: 'dan', - en: 'eng', - eo: 'epo', - es: 'spa', - eu: 'eus', - fr: 'fra', - gl: 'glg', - hr: 'hbs_HR', - id: 'ind', - is: 'isl', - it: 'ita', - kk: 'kaz', - la: 'lat', - mk: 'mkd', - ms: 'msa', - mt: 'mlt', - nb: 'nob', - nl: 'nld', - nn: 'nno', - oc: 'oci', - pt: 'por', - ro: 'ron', - sh: 'hbs', - sl: 'slv', - sr: 'hbs_SR', - sv: 'swe', - tt: 'tat' -}; diff --git a/tests/mt/Apertium.test.js b/tests/mt/Apertium.test.js index 7d2c5a9..d957a47 100644 --- a/tests/mt/Apertium.test.js +++ b/tests/mt/Apertium.test.js @@ -83,12 +83,13 @@ var textTranslations; // Fake the actual Apertium call - CX.Apertium.translateLinesApertium = function ( sourceLang, targetLang, sourceLines ) { + CX.Apertium.prototype.translateLines = function ( sourceLang, targetLang, sourceLines ) { var deferred = Q.defer(); + setTimeout( function () { var targetLines; try { - sourceLines.map( function ( line ) { + targetLines = sourceLines.map( function ( line ) { return textTranslations[ line ] || 'X' + line + 'X'; } ); deferred.resolve( targetLines ); @@ -102,7 +103,9 @@ QUnit.expect( tests.length ); function resumeTests( i ) { - var test; + var test, + apertium = new CX.Apertium(); + if ( i >= tests.length ) { return; } @@ -110,8 +113,7 @@ textTranslations = test.textTranslations; QUnit.stop(); - - CX.Apertium.translate( 'xx', 'yy', test.source ).then( function ( target ) { + apertium.translate( 'en', 'es', test.source ).then( function ( target ) { assert.strictEqual( target, test.target, -- To view, visit https://gerrit.wikimedia.org/r/175398 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I5880060274856ad499d98089fbd65b732c284d37 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits