Santhosh has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/372503 )
Change subject: Refresh MT client modules as classes - ES6 upgrade ...................................................................... Refresh MT client modules as classes - ES6 upgrade Change-Id: I0a9aaf9b71ee53d093956995cbe3073001f62081 --- M lib/mt/Apertium.js M lib/mt/MTClient.js M lib/mt/Yandex.js M lib/mt/Youdao.js 4 files changed, 547 insertions(+), 589 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/03/372503/1 diff --git a/lib/mt/Apertium.js b/lib/mt/Apertium.js index 89f8575..af9af5d 100644 --- a/lib/mt/Apertium.js +++ b/lib/mt/Apertium.js @@ -1,47 +1,39 @@ 'use strict'; var preq = require( 'preq' ), - util = require( 'util' ), MTClient = require( './MTClient.js' ), - apertiumLangMapping = require( './Apertium.languagenames.json' ), - postData; + apertiumLangMapping = require( './Apertium.languagenames.json' ); -function Apertium( options ) { - this.logger = options.logger; - this.conf = options.conf; +class Apertium extends MTClient { + /** + * Translate plain text with Apertium API + * Apertium is not capable of HTML translation with all annotation + * mapping. For translating HTML, It use CX's annotation mapping on top + * of the plaintext translation. Hence it inherits translateHTML method + * of MTClient. + * + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string} sourceText Source language text + * @return {BBPromise} promise: Target language text + */ + translateText( sourceLang, targetLang, sourceText ) { + const postData = { + uri: this.conf.mt.apertium.api + '/translate', + body: { + markUnknown: 0, + langpair: apertiumLangMapping[ sourceLang ] + '|' + apertiumLangMapping[ targetLang ], + format: 'txt', + q: sourceText + } + }; + + return preq.post( postData ) + .then( ( response ) => response.body.responseData.translatedText ) + .catch( function () { + throw new Error( 'Translation with Apertium failed: ' + sourceLang + '-' + targetLang ); + } ); + } } - -util.inherits( Apertium, MTClient ); - -/** - * Translate plain text with Apertium API - * Apertium is not capable of HTML translation with all annotation - * mapping. For translating HTML, It use CX's annotation mapping on top - * of the plaintext translation. Hence it inherits translateHTML method - * of MTClient. - * - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string} sourceText Source language text - * @return {BBPromise} promise: Target language text - */ -Apertium.prototype.translateText = function ( sourceLang, targetLang, sourceText ) { - postData = { - uri: this.conf.mt.apertium.api + '/translate', - body: { - markUnknown: 0, - langpair: apertiumLangMapping[ sourceLang ] + '|' + apertiumLangMapping[ targetLang ], - format: 'txt', - q: sourceText - } - }; - - return preq.post( postData ).then( function ( response ) { - return response.body.responseData.translatedText; - } ).catch( function () { - throw new Error( 'Translation with Apertium failed: ' + - sourceLang + '-' + targetLang ); - } ); -}; module.exports = Apertium; diff --git a/lib/mt/MTClient.js b/lib/mt/MTClient.js index 1190ff4..1cce712 100644 --- a/lib/mt/MTClient.js +++ b/lib/mt/MTClient.js @@ -1,80 +1,61 @@ 'use strict'; -var LinearDoc = require( __dirname + '/../lineardoc' ), - BBPromise = require( 'bluebird' ), +const LinearDoc = require( __dirname + '/../lineardoc' ), SubSequenceMatcher = require( './annotationmapper/SubsequenceMatcher.js' ), createDOMPurify = require( 'dompurify' ), jsdom = require( 'jsdom' ); /** * MTClient - Generic machine translation client. - * - * @class - * - * @param {Object} options */ -function MTClient( options ) { - this.logger = options.logger; - this.conf = options.conf; - this.sourceDoc = null; - this.sourceHTML = null; -} - -MTClient.prototype.log = function ( level, info ) { - if ( this.logger && this.logger.log ) { - this.logger.log( level, info ); +class MTClient { + /** + * @param {Object} options + */ + constructor( options ) { + this.logger = options.logger; + this.conf = options.conf; + this.sourceDoc = null; + this.sourceHTML = null; } -}; -/** - * Translate the given content between the language pairs. - * - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string} content Content to translate - * @param {string} [format="html"] Format of the content- html or text - * @return {Object} Deferred promise: Target language text - */ -MTClient.prototype.translate = function ( sourceLang, targetLang, content, format ) { - if ( format === 'text' ) { - return this.translateText( sourceLang, targetLang, content ); - } else { - return this.translateHtml( sourceLang, targetLang, content ); + log( level, info ) { + if ( this.logger && this.logger.log ) { + this.logger.log( level, info ); + } } -}; -/** - * Translate marked-up text - * - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string} sourceHtml Source html - * @return {Promise} promise: Translated html - */ -MTClient.prototype.translateHtml = function ( sourceLang, targetLang, sourceHtml ) { - var i, len, targetDoc, chain = [], - self = this; + /** + * Translate the given content between the language pairs. + * + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string} content Content to translate + * @param {string} [format="html"] Format of the content- html or text + * @return {Object} Deferred promise: Target language text + */ + translate( sourceLang, targetLang, content, format ) { + if ( format === 'text' ) { + return this.translateText( sourceLang, targetLang, content ); + } else { + return this.translateHtml( sourceLang, targetLang, content ); + } + } - this.buildSourceDoc( sourceHtml ); - // Clone and adapt sourceDoc - targetDoc = new LinearDoc.Doc( this.sourceDoc.wrapperTag ); - - function translateItemDeferred( item ) { + translateItemDeferred( item, sourceLang, targetLang ) { if ( item.type !== 'textblock' ) { - return BBPromise.resolve( item ); + return Promise.resolve( item ); } - return self.translateTextWithTagOffsets( + return this.translateTextWithTagOffsets( sourceLang, targetLang, item.item.getPlainText(), item.item.getTagOffsets() - ).then( function ( translated ) { + ).then( ( translated ) => { var newTextBlock; - newTextBlock = item.item.translateTags( - translated.text, translated.rangeMappings - ); + newTextBlock = item.item.translateTags( translated.text, translated.rangeMappings ); return { type: 'textblock', @@ -83,310 +64,319 @@ } ); } - for ( i = 0, len = this.sourceDoc.items.length; i < len; i++ ) { - chain.push( translateItemDeferred( this.sourceDoc.items[ i ] ) ); + /** + * Translate marked-up text + * + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string} sourceHtml Source html + * @return {Promise} Promise that resolves Translated html + */ + translateHtml( sourceLang, targetLang, sourceHtml ) { + var targetDoc, chain = []; + + this.buildSourceDoc( sourceHtml ); + // Clone and adapt sourceDoc + targetDoc = new LinearDoc.Doc( this.sourceDoc.wrapperTag ); + + for ( let i = 0, len = this.sourceDoc.items.length; i < len; i++ ) { + chain.push( this.translateItemDeferred( this.sourceDoc.items[ i ], sourceLang, targetLang ) ); + } + + return Promise.all( chain ).then( ( results ) => { + targetDoc.items = results; + // Return sanitized HTML output + return this.sanitize( targetDoc.getHtml() ); + } ); } - return BBPromise.all( chain ).then( function ( results ) { - targetDoc.items = results; - // Return sanitized HTML output - return self.sanitize( targetDoc.getHtml() ); - } ); -}; - -/** - * Sanitize given HTML using DOMPurify - * @param {string} html Dirty HTML - * @return {string} sanitized HTML output - */ -MTClient.prototype.sanitize = function ( html ) { - if ( !this.DOMPurify ) { + /** + * Sanitize given HTML using DOMPurify + * @param {string} html Dirty HTML + * @return {string} sanitized HTML output + */ + sanitize( html ) { + if ( !this.DOMPurify ) { // Lazy initialize DOMPurify - this.DOMPurify = createDOMPurify( ( new jsdom.JSDOM( '' ) ).window ); + this.DOMPurify = createDOMPurify( ( new jsdom.JSDOM( '' ) ).window ); + } + + if ( !this.DOMPurify.isSupported ) { + throw new Error( 'DOMPurify not suppported in the DOM environment provided by JSDOM' ); + } + + return this.DOMPurify.sanitize( html, { + ADD_ATTR: [ 'typeof' ], // typeof is not a known attribute for DOMPurify + ADD_URI_SAFE_ATTR: [ 'rel', 'typeof' ] // Without this rel="mw:WikiLink" attributes will be removed. + } ); } - if ( !this.DOMPurify.isSupported ) { - throw new Error( 'DOMPurify not suppported in the DOM environment provided by JSDOM' ); - } + /** + * Translate text, using case variants to map tag offsets + * + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string} sourceText Source plain text + * @param {Object[]} tagOffsets start and length for each annotation chunk + * @return {Object} Deferred promise: Translated plain text and range mappings + */ + translateTextWithTagOffsets( sourceLang, targetLang, sourceText, tagOffsets ) { + var subSequences, sourceLines, i, m, preSpaces, postSpaces, trimmedSourceLines; - return this.DOMPurify.sanitize( html, { - ADD_ATTR: [ 'typeof' ], // typeof is not a known attribute for DOMPurify - ADD_URI_SAFE_ATTR: [ 'rel', 'typeof' ] // Without this rel="mw:WikiLink" attributes will be removed. - } ); -}; + subSequences = this.getSubSequences( sourceLang, sourceText, tagOffsets ); + sourceLines = subSequences.map( ( variant ) => variant.text ); + sourceLines.splice( 0, 0, sourceText ); -/** - * Translate text, using case variants to map tag offsets - * - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string} sourceText Source plain text - * @param {Object[]} tagOffsets start and length for each annotation chunk - * @return {Object} Deferred promise: Translated plain text and range mappings - */ -MTClient.prototype.translateTextWithTagOffsets = function ( sourceLang, targetLang, sourceText, tagOffsets ) { - var subSequences, sourceLines, i, m, preSpaces, postSpaces, trimmedSourceLines, - self = this; - - subSequences = this.getSubSequences( sourceLang, sourceText, tagOffsets ); - sourceLines = subSequences.map( function ( variant ) { - return variant.text; - } ); - sourceLines.splice( 0, 0, sourceText ); - - // Strip and store leading/trailing whitespace before sending text to MT server - preSpaces = []; - postSpaces = []; - trimmedSourceLines = []; - for ( i = 0; i < sourceLines.length; i++ ) { + // Strip and store leading/trailing whitespace before sending text to MT server + preSpaces = []; + postSpaces = []; + trimmedSourceLines = []; + for ( i = 0; i < sourceLines.length; i++ ) { // Search for zero or more leading and trailing spaces. This will always match. - m = sourceLines[ i ].match( /^(\s*)([\s\S]*?)(\s*)$/ ); - if ( !m ) { + m = sourceLines[ i ].match( /^(\s*)([\s\S]*?)(\s*)$/ ); + if ( !m ) { // See https://phabricator.wikimedia.org/T86625. This not supposed to happen. - this.log( 'error', 'Regex to extract trailing and leading space failed for ' + sourceLines[ i ] ); - m = [ '', '', sourceLines[ i ], '' ]; + this.log( 'error', 'Regex to extract trailing and leading space failed for ' + sourceLines[ i ] ); + m = [ '', '', sourceLines[ i ], '' ]; + } + preSpaces[ i ] = m[ 1 ]; + trimmedSourceLines[ i ] = m[ 2 ]; + postSpaces[ i ] = m[ 3 ]; } - preSpaces[ i ] = m[ 1 ]; - trimmedSourceLines[ i ] = m[ 2 ]; - postSpaces[ i ] = m[ 3 ]; - } - // Join segments with a string that will definitely break sentences and be preserved - return self.translateLines( - sourceLang, - targetLang, - trimmedSourceLines - ).then( function ( unnormalizedTargetLines ) { - var targetLines, targetText, rangeMappings; + // Join segments with a string that will definitely break sentences and be preserved + return this.translateLines( + sourceLang, + targetLang, + trimmedSourceLines + ).then( ( unnormalizedTargetLines ) => { + var targetLines, targetText, rangeMappings; - // Restore leading/trailing whitespace from source - targetLines = unnormalizedTargetLines.map( function ( line, i ) { - return preSpaces[ i ] + line.replace( /^\s+|\s+$/g, '' ) + postSpaces[ i ]; + // Restore leading/trailing whitespace from source + targetLines = unnormalizedTargetLines + .map( ( line, i ) => preSpaces[ i ] + line.replace( /^\s+|\s+$/g, '' ) + postSpaces[ i ] ); + + try { + targetText = targetLines.splice( 0, 1 )[ 0 ]; + rangeMappings = this.getSequenceMappings( + targetLang, + subSequences, + targetText, + targetLines + ); + } catch ( ex ) { + // If annotation mapping fails for any reason, return translated text + // without annotations. + this.log( 'debug/mt', 'Error while mapping annotations ' + ex.stack ); + rangeMappings = {}; + } + return { + text: targetText, + rangeMappings: rangeMappings + }; } ); - try { - targetText = targetLines.splice( 0, 1 )[ 0 ]; - rangeMappings = self.getSequenceMappings( - targetLang, - subSequences, - targetText, - targetLines - ); - } catch ( ex ) { - // If annotation mapping fails for any reason, return translated text - // without annotations. - self.log( 'debug/mt', 'Error while mapping annotations ' + ex.stack ); - rangeMappings = {}; - } - return { - text: targetText, - rangeMappings: rangeMappings - }; - } ); -}; - -/** - * Translate multiple lines of plaintext - * - * The output may need normalizing for leading/trailing whitespace etc. - * - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string[]} sourceLines Source plaintext lines - * @return {Promise} Translated plaintext lines - */ -MTClient.prototype.translateLines = function ( sourceLang, targetLang, sourceLines ) { - var sourceLinesText; - - // Join lines into single string. Separator must break sentences and pass through unchanged - // Using Devangari separator Double Danda twice. - sourceLinesText = sourceLines.join( '.॥॥.' ); - - return this.translateText( - sourceLang, - targetLang, - sourceLinesText - ).then( function ( targetLinesText ) { - var targetText = targetLinesText.split( /\.॥॥\./g ); - return targetText; - } ); -}; - -/** - * Create variants of the text, with a different annotation uppercased in each. - * - * @param {string} lang Language code - * @param {string} sourceText Text - * @param {Object[]} annotationOffsets start and length of each annotation - * @return {Object[]} - * @return {number} Object.start Start offset of uppercasing - * @return {number} Object.length Length of uppercasing - * @return {string} Object.text Text variant with uppercasing - */ -MTClient.prototype.getSubSequences = function ( lang, sourceText, annotationOffsets ) { - var i, len, offset, subSequences = []; - - for ( i = 0, len = annotationOffsets.length; i < len; i++ ) { - offset = annotationOffsets[ i ]; - subSequences.push( { - start: offset.start, - length: offset.length, - text: sourceText.slice( offset.start, offset.start + offset.length ) - } ); - } - return subSequences; -}; - -/** - * Check if a range already exist in the array of ranges already located. - * A range is start position and length indicating position of certain text - * in a bigger text. - * This is not just a membership check. If the range we are checking - * falls under the start and end position of an already existing range, then also - * we consider it as an overlapping range. - * For example [start:5, length:4] and [start:6, length:3] overlaps. - * - * @param {Object} range - * @param {Object[]} rangeArray - * @return {boolean} Whether the range overlap or exist in any range in the given - * range array - */ -function isOverlappingRange( range, rangeArray ) { - var i, rangeStart, rangeEnd, start, end; - - rangeStart = range.start; - rangeEnd = range.start + range.length; - for ( i = 0; i < rangeArray.length; i++ ) { - start = rangeArray[ i ].start; - end = start + rangeArray[ i ].length; - if ( start >= rangeStart && end <= rangeEnd || - start <= rangeStart && end >= rangeEnd ) { - return true; - } } - return false; -} + /** + * Translate multiple lines of plaintext + * + * The output may need normalizing for leading/trailing whitespace etc. + * + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string[]} sourceLines Source plaintext lines + * @return {Promise} Translated plaintext lines + */ + translateLines( sourceLang, targetLang, sourceLines ) { + var sourceLinesText; -/** - * Calculate range mappings based on the target text variants. - * - * @param {string} targetLang The target language. - * @param {Object[]} subSequences The start and length of each subsequence. - * @param {string} targetText The translated text. - * @param {Object} targetLines Translation of each subsequences. - * @return {Object[]} The location of source and translation sequences in the text. - * @return {number} Object.source.start {number} Start position of source subSequence in the text. - * @return {number} Object.source.length {number} Length of source subSequence in the text. - * @return {number} Object.target.start {number} Start position of sequence in the text. - * @return {number} Object.target.length {number} Length of matched sequence in the text. - */ -MTClient.prototype.getSequenceMappings = function ( targetLang, subSequences, targetText, targetLines ) { - var i, iLen, targetRange, sourceRange, subSequence, - rangeMappings = [], - targetRanges = [], - occurrences = {}; + // Join lines into single string. Separator must break sentences and pass through unchanged + // Using Devangari separator Double Danda twice. + sourceLinesText = sourceLines.join( '.॥॥.' ); - if ( subSequences.length !== targetLines.length ) { - // We must have translation for all subSequences. - throw new Error( 'Translation variants length mismatch' ); + return this.translateText( + sourceLang, + targetLang, + sourceLinesText + ).then( ( targetLinesText ) => targetLinesText.split( /\.॥॥\./g ) ); } - for ( i = 0, iLen = subSequences.length; i < iLen; i++ ) { - subSequence = subSequences[ i ]; - sourceRange = { - start: subSequence.start, - length: subSequence.length - }; - // Keep track of repeated occurrences of a subsequence in the text. A word can repeat - // in a translation block. - occurrences[ subSequence.text ] = - occurrences[ subSequence.text ] === undefined ? 0 : occurrences[ subSequence.text ] + 1; - // Find the position of the translated subsequence in translated text. - // This involves a non-trivial fuzzy matching algorithm - targetRange = this.findSubSequence( - targetText, targetLines[ i ], targetLang, occurrences[ subSequence.text ] - ); + /** + * Create variants of the text, with a different annotation uppercased in each. + * + * @param {string} lang Language code + * @param {string} sourceText Text + * @param {Object[]} annotationOffsets start and length of each annotation + * @return {Object[]} + * @return {number} Object.start Start offset of uppercasing + * @return {number} Object.length Length of uppercasing + * @return {string} Object.text Text variant with uppercasing + */ + getSubSequences( lang, sourceText, annotationOffsets ) { + var offset, subSequences = []; - if ( targetRange && !isOverlappingRange( targetRange, targetRanges ) ) { - // targetRanges keep track of all ranges we located. Used for overlap - // detection. - targetRanges.push( targetRange ); - rangeMappings.push( { - source: sourceRange, - target: targetRange + for ( let i = 0, len = annotationOffsets.length; i < len; i++ ) { + offset = annotationOffsets[ i ]; + subSequences.push( { + start: offset.start, + length: offset.length, + text: sourceText.slice( offset.start, offset.start + offset.length ) } ); } - } - return rangeMappings; -}; - -/** - * Locate the given sequence in the translated text. - * Example: - * Searching 'tropical' in 'They are subtropical and tropical flowers.', 'tropical', - * returns { start: 12, length: 8 } - * - * @param {string} text The translated text. - * @param {string} sequence The search string. - * @param {string} language Language of the text. Used for language specific matching. - * @param {number} occurrence Pass 1 for first occurrence, 2 for second occurrence, so on. - * @return {null|Object} The location of the sequence in the text. - * @return {null|number} Object.start {number} Start position of sequence in the text. - * @return {null|number} Object.lengthLength of matched sequence in the text. - */ -MTClient.prototype.findSubSequence = function ( text, sequence, language, occurrence ) { - var indices, matcher; - - matcher = new SubSequenceMatcher( language ); - indices = matcher.findFuzzyMatch( text, sequence ); - // Find the nth occurrence position - - if ( !indices || indices.length < occurrence ) { - return null; - } - if ( occurrence === 0 ) { - return matcher.bestMatch( indices ); - } - return indices[ occurrence ]; -}; - -/** - * Build the LinearDoc for the given source html - * - * @param {string} sourceHtml The html content - */ -MTClient.prototype.buildSourceDoc = function ( sourceHtml ) { - var parser; - - if ( this.sourceDoc ) { - return; + return subSequences; } - if ( !sourceHtml ) { - throw new Error( 'Invalid sourceHtml' ); + /** + * Check if a range already exist in the array of ranges already located. + * A range is start position and length indicating position of certain text + * in a bigger text. + * This is not just a membership check. If the range we are checking + * falls under the start and end position of an already existing range, then also + * we consider it as an overlapping range. + * For example [start:5, length:4] and [start:6, length:3] overlaps. + * + * @param {Object} range + * @param {Object[]} rangeArray + * @return {boolean} Whether the range overlap or exist in any range in the given + * range array + */ + isOverlappingRange( range, rangeArray ) { + var rangeStart, rangeEnd, start, end; + + rangeStart = range.start; + rangeEnd = range.start + range.length; + for ( let i = 0; i < rangeArray.length; i++ ) { + start = rangeArray[ i ].start; + end = start + rangeArray[ i ].length; + if ( start >= rangeStart && end <= rangeEnd || + start <= rangeStart && end >= rangeEnd ) { + return true; + } + } + + return false; } - parser = new LinearDoc.Parser( { - // For the proper annotation mapping between source and translated content, - // we need to treat each sentence as isolated. - // In other words, trying to find mappings in a sentence context has better results - // compared to the mapping done in a whole paragraph content. - isolateSegments: true - } ); - parser.init(); - parser.write( sourceHtml ); - this.sourceHTML = sourceHtml; - this.sourceDoc = parser.builder.doc; -}; + /** + * Calculate range mappings based on the target text variants. + * + * @param {string} targetLang The target language. + * @param {Object[]} subSequences The start and length of each subsequence. + * @param {string} targetText The translated text. + * @param {Object} targetLines Translation of each subsequences. + * @return {Object[]} The location of source and translation sequences in the text. + * @return {number} Object.source.start {number} Start position of source subSequence in the text. + * @return {number} Object.source.length {number} Length of source subSequence in the text. + * @return {number} Object.target.start {number} Start position of sequence in the text. + * @return {number} Object.target.length {number} Length of matched sequence in the text. + */ + getSequenceMappings( targetLang, subSequences, targetText, targetLines ) { + var targetRange, sourceRange, subSequence, + rangeMappings = [], + targetRanges = [], + occurrences = {}; -/** - * Whether this engine needs authentication with JWT - * - * @return {boolean} - */ -MTClient.prototype.requiresAuthorization = function () { - return false; -}; + if ( subSequences.length !== targetLines.length ) { + // We must have translation for all subSequences. + throw new Error( 'Translation variants length mismatch' ); + } + + for ( let i = 0, iLen = subSequences.length; i < iLen; i++ ) { + subSequence = subSequences[ i ]; + sourceRange = { + start: subSequence.start, + length: subSequence.length + }; + // Keep track of repeated occurrences of a subsequence in the text. A word can repeat + // in a translation block. + occurrences[ subSequence.text ] = + occurrences[ subSequence.text ] === undefined ? 0 : occurrences[ subSequence.text ] + 1; + // Find the position of the translated subsequence in translated text. + // This involves a non-trivial fuzzy matching algorithm + targetRange = this.findSubSequence( + targetText, targetLines[ i ], targetLang, occurrences[ subSequence.text ] + ); + + if ( targetRange && !this.isOverlappingRange( targetRange, targetRanges ) ) { + // targetRanges keep track of all ranges we located. Used for overlap + // detection. + targetRanges.push( targetRange ); + rangeMappings.push( { + source: sourceRange, + target: targetRange + } ); + } + } + return rangeMappings; + } + + /** + * Locate the given sequence in the translated text. + * Example: + * Searching 'tropical' in 'They are subtropical and tropical flowers.', 'tropical', + * returns { start: 12, length: 8 } + * + * @param {string} text The translated text. + * @param {string} sequence The search string. + * @param {string} language Language of the text. Used for language specific matching. + * @param {number} occurrence Pass 1 for first occurrence, 2 for second occurrence, so on. + * @return {null|Object} The location of the sequence in the text. + * @return {null|number} Object.start {number} Start position of sequence in the text. + * @return {null|number} Object.lengthLength of matched sequence in the text. + */ + findSubSequence( text, sequence, language, occurrence ) { + var indices, matcher; + + matcher = new SubSequenceMatcher( language ); + indices = matcher.findFuzzyMatch( text, sequence ); + // Find the nth occurrence position + if ( !indices || indices.length < occurrence ) { + return null; + } + if ( occurrence === 0 ) { + return matcher.bestMatch( indices ); + } + return indices[ occurrence ]; + } + + /** + * Build the LinearDoc for the given source html + * + * @param {string} sourceHtml The html content + */ + buildSourceDoc( sourceHtml ) { + var parser; + + if ( this.sourceDoc ) { + return; + } + + if ( !sourceHtml ) { + throw new Error( 'Invalid sourceHtml' ); + } + + parser = new LinearDoc.Parser( { + // For the proper annotation mapping between source and translated content, + // we need to treat each sentence as isolated. + // In other words, trying to find mappings in a sentence context has better results + // compared to the mapping done in a whole paragraph content. + isolateSegments: true + } ); + parser.init(); + parser.write( sourceHtml ); + this.sourceHTML = sourceHtml; + this.sourceDoc = parser.builder.doc; + } + + /** + * Whether this engine needs authentication with JWT + * + * @return {boolean} + */ + requiresAuthorization() { + return false; + } +} module.exports = MTClient; diff --git a/lib/mt/Yandex.js b/lib/mt/Yandex.js index 00beeb4..53305dc 100644 --- a/lib/mt/Yandex.js +++ b/lib/mt/Yandex.js @@ -1,100 +1,89 @@ 'use strict'; -var - util = require( 'util' ), - preq = require( 'preq' ), - BBPromise = require( 'bluebird' ), +const preq = require( 'preq' ), MTClient = require( './MTClient.js' ), - yandexLanguageNameMap; + yandexLanguageNameMap = { + 'be-tarask': 'be', // T122033 + nb: 'no' // T132217 + }; -// Yandex language codes can differ from the language codes that -// we use. -yandexLanguageNameMap = { - 'be-tarask': 'be', // T122033 - nb: 'no' // T132217 -}; +class Yandex extends MTClient { -function Yandex( options ) { - this.logger = options.logger; - this.conf = options.conf; -} + /** + * Translate html or plain text content with Yandex. + * Yandex is capable of translating plain text and html with + * annotations mapping (keeps markup retained in translated content). + * Hence overriding translate method of MTClient. + * + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string} sourceText Source language text + * @return {Q.Promise} Target language text + */ + translate( sourceLang, targetLang, sourceText ) { + var key, postData; -util.inherits( Yandex, MTClient ); - -/** - * Translate html or plain text content with Yandex. - * Yandex is capable of translating plain text and html with - * annotations mapping (keeps markup retained in translated content). - * Hence overriding translate method of MTClient. - * - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string} sourceText Source language text - * @return {Q.Promise} Target language text - */ -Yandex.prototype.translate = function ( sourceLang, targetLang, sourceText ) { - var key, postData, self = this; - - key = this.conf.mt.yandex.key; - if ( key === null ) { - return BBPromise.reject( new Error( 'Yandex service is misconfigured' ) ); - } - - if ( sourceText.length > 10000 ) { - return BBPromise.reject( new Error( 'Source text too long: ' + - sourceLang + '-' + targetLang ) ); - } - - sourceLang = yandexLanguageNameMap[ sourceLang ] || sourceLang; - targetLang = yandexLanguageNameMap[ targetLang ] || targetLang; - - postData = { - uri: this.conf.mt.yandex.api + '/api/v1.5/tr.json/translate', - proxy: this.conf.proxy, - body: { - key: key, - lang: sourceLang + '-' + targetLang, - format: 'html', - text: sourceText + key = this.conf.mt.yandex.key; + if ( key === null ) { + return Promise.reject( new Error( 'Yandex service is misconfigured' ) ); } - }; - return preq.post( postData ).then( function ( response ) { - return response.body.text[ 0 ]; - } ).catch( function ( response ) { - throw new Error( 'Translation with Yandex failed. Error: ' + - self.getErrorName( response.body.code ) + ' ' + sourceLang + '-' + targetLang ); - } ); -}; + if ( sourceText.length > 10000 ) { + return Promise.reject( new Error( 'Source text too long: ' + + sourceLang + '-' + targetLang ) ); + } -/** - * Returns error name from error code. - * - * @param {number} code Error code - * @return {string} - */ -Yandex.prototype.getErrorName = function ( code ) { - // http://api.yandex.com/translate/doc/dg/reference/translate.xml - var errormap = { - 200: 'ERR_OK', - 401: 'ERR_KEY_INVALID', - 402: 'ERR_KEY_BLOCKED', - 403: 'ERR_DAILY_REQ_LIMIT_EXCEEDED', - 404: 'ERR_DAILY_CHAR_LIMIT_EXCEEDED', - 413: 'ERR_TEXT_TOO_LONG', - 422: 'ERR_UNPROCESSABLE_TEXT', - 501: 'ERR_LANG_NOT_SUPPORTED' - }; + sourceLang = yandexLanguageNameMap[ sourceLang ] || sourceLang; + targetLang = yandexLanguageNameMap[ targetLang ] || targetLang; - if ( code in errormap ) { - return errormap[ code ]; + postData = { + uri: this.conf.mt.yandex.api + '/api/v1.5/tr.json/translate', + proxy: this.conf.proxy, + body: { + key, + lang: sourceLang + '-' + targetLang, + format: 'html', + text: sourceText + } + }; + + return preq.post( postData ) + .then( ( response ) => response.body.text[ 0 ] ) + .catch( ( response ) => { + throw new Error( 'Translation with Yandex failed. Error: ' + + this.getErrorName( response.body.code ) + ' ' + sourceLang + '-' + targetLang ); + } ); } - return 'Unknown error'; -}; + /** + * Returns error name from error code. + * + * @param {number} code Error code + * @return {string} + */ + getErrorName( code ) { + // http://api.yandex.com/translate/doc/dg/reference/translate.xml + var errormap = { + 200: 'ERR_OK', + 401: 'ERR_KEY_INVALID', + 402: 'ERR_KEY_BLOCKED', + 403: 'ERR_DAILY_REQ_LIMIT_EXCEEDED', + 404: 'ERR_DAILY_CHAR_LIMIT_EXCEEDED', + 413: 'ERR_TEXT_TOO_LONG', + 422: 'ERR_UNPROCESSABLE_TEXT', + 501: 'ERR_LANG_NOT_SUPPORTED' + }; -Yandex.prototype.requiresAuthorization = function () { - return true; -}; + if ( code in errormap ) { + return errormap[ code ]; + } + + return 'Unknown error'; + } + + requiresAuthorization() { + return true; + } +} module.exports = Yandex; diff --git a/lib/mt/Youdao.js b/lib/mt/Youdao.js index 94cb4da..14ecc40 100644 --- a/lib/mt/Youdao.js +++ b/lib/mt/Youdao.js @@ -1,160 +1,147 @@ 'use strict'; -var - util = require( 'util' ), - preq = require( 'preq' ), +const preq = require( 'preq' ), LinearDoc = require( './../lineardoc' ), - BBPromise = require( 'bluebird' ), MTClient = require( './MTClient.js' ), - youdaoLanguageNameMap; + youdaoLanguageNameMap = { + 'en>zh': 'EN2ZH_CN', // English to Chinese Simplified + 'simple>zh': 'EN2ZH_CN', // English to Chinese Simplified + 'en>zh-cn': 'EN2ZH_CN', // English to Chinese Simplified + 'simple>zh-cn': 'EN2ZH_CN', // English to Chinese Simplified + 'ja>zh-cn': 'JA2ZH_CN', // Japanese to Chinese Simplified, + 'ja>zh': 'JA2ZH_CN', // Japanese to Chinese Simplified, + 'ko>zh-cn': 'KR2ZH_CN', // Korean to Chinese Simplified + 'fr>zh-cn': 'FR2ZH_CN', // Korean to Chinese Simplified + 'ru>zh-cn': 'RU2ZH_CN', // Russian to Chinese Simplified + 'es>zh-cn': 'SP2ZH_CN', // Spanish to Chinese Simplified + 'zh>en': 'ZH_CN2EN', // Chinese Simplified/Traditional to English + 'zh>simple': 'ZH_CN2EN', // Chinese Simplified/Traditional to Simple English + 'zh>ja': 'ZH_CN2JA', // Chinese Simplified/Traditional to Japanese + 'zh>ko': 'ZH_CN2KR', // Chinese Simplified/Traditional to Korean + 'zh>fr': 'ZH_CN2FR', // Chinese Simplified/Traditional to French + 'zh>ru': 'ZH_CN2RU', // Chinese Simplified/Traditional to Russian + 'zh>es': 'ZH_CN2SP' // Chinese Simplified/Traditional to Spanish + }; -// Youdao language codes differ from the language codes that we use. -youdaoLanguageNameMap = { - 'en>zh': 'EN2ZH_CN', // English to Chinese Simplified - 'simple>zh': 'EN2ZH_CN', // English to Chinese Simplified - 'en>zh-cn': 'EN2ZH_CN', // English to Chinese Simplified - 'simple>zh-cn': 'EN2ZH_CN', // English to Chinese Simplified - 'ja>zh-cn': 'JA2ZH_CN', // Japanese to Chinese Simplified, - 'ja>zh': 'JA2ZH_CN', // Japanese to Chinese Simplified, - 'ko>zh-cn': 'KR2ZH_CN', // Korean to Chinese Simplified - 'fr>zh-cn': 'FR2ZH_CN', // Korean to Chinese Simplified - 'ru>zh-cn': 'RU2ZH_CN', // Russian to Chinese Simplified - 'es>zh-cn': 'SP2ZH_CN', // Spanish to Chinese Simplified - 'zh>en': 'ZH_CN2EN', // Chinese Simplified/Traditional to English - 'zh>simple': 'ZH_CN2EN', // Chinese Simplified/Traditional to Simple English - 'zh>ja': 'ZH_CN2JA', // Chinese Simplified/Traditional to Japanese - 'zh>ko': 'ZH_CN2KR', // Chinese Simplified/Traditional to Korean - 'zh>fr': 'ZH_CN2FR', // Chinese Simplified/Traditional to French - 'zh>ru': 'ZH_CN2RU', // Chinese Simplified/Traditional to Russian - 'zh>es': 'ZH_CN2SP' // Chinese Simplified/Traditional to Spanish -}; +class Youdao extends MTClient { + /** + * Translate marked-up text + * Youdao does not support HTML translation. So we need to pass the plain text + * version. We are not piping this to translateText because we want to preseve + * the textblocks. But we cannot do annotation mapping because of complexity of + * segmentation for CJK languages. + * + * @param {string} sourceLang Source language code + * @param {string} targetLang Target language code + * @param {string} sourceHtml Source html + * @return {Promise} promise: Translated html + */ + translateHtml( sourceLang, targetLang, sourceHtml ) { + var i, len, targetDoc, chain = [], + self = this; -function Youdao( options ) { - this.logger = options.logger; - this.conf = options.conf; -} + this.buildSourceDoc( sourceHtml ); + // Clone and adapt sourceDoc + targetDoc = new LinearDoc.Doc( this.sourceDoc.wrapperTag ); -util.inherits( Youdao, MTClient ); + function translateItemDeferred( item ) { + if ( item.type !== 'textblock' ) { + return Promise.resolve( item ); + } -/** - * Translate marked-up text - * Youdao does not support HTML translation. So we need to pass the plain text - * version. We are not piping this to translateText because we want to preseve - * the textblocks. But we cannot do annotation mapping because of complexity of - * segmentation for CJK languages. - * - * @param {string} sourceLang Source language code - * @param {string} targetLang Target language code - * @param {string} sourceHtml Source html - * @return {Promise} promise: Translated html - */ -Youdao.prototype.translateHtml = function ( sourceLang, targetLang, sourceHtml ) { - var i, len, targetDoc, chain = [], - self = this; + return self.translateText( + sourceLang, + targetLang, + item.item.getPlainText() + ).then( function ( translated ) { + var newTextBlock; - this.buildSourceDoc( sourceHtml ); - // Clone and adapt sourceDoc - targetDoc = new LinearDoc.Doc( this.sourceDoc.wrapperTag ); + newTextBlock = item.item.translateTags( + translated, {} // Range mapping is empty. We dont do annotation mapping. + ); - function translateItemDeferred( item ) { - if ( item.type !== 'textblock' ) { - return BBPromise.resolve( item ); + return { + type: 'textblock', + item: newTextBlock + }; + } ); } - return self.translateText( - sourceLang, - targetLang, - item.item.getPlainText() - ).then( function ( translated ) { - var newTextBlock; + for ( i = 0, len = this.sourceDoc.items.length; i < len; i++ ) { + chain.push( translateItemDeferred( this.sourceDoc.items[ i ] ) ); + } - newTextBlock = item.item.translateTags( - translated, {} // Range mapping is empty. We dont do annotation mapping. - ); - - return { - type: 'textblock', - item: newTextBlock - }; + return Promise.all( chain ).then( ( results ) => { + targetDoc.items = results; + return targetDoc.getHtml(); } ); } - for ( i = 0, len = this.sourceDoc.items.length; i < len; i++ ) { - chain.push( translateItemDeferred( this.sourceDoc.items[ i ] ) ); - } + translateText( sourceLang, targetLang, sourceText ) { + var key, postData; - return BBPromise.all( chain ).then( function ( results ) { - targetDoc.items = results; - return targetDoc.getHtml(); - } ); -}; + key = this.conf.mt.youdao.key; + if ( key === null ) { + return Promise.reject( new Error( 'Youdao service is misconfigured' ) ); + } -Youdao.prototype.translateText = function ( sourceLang, targetLang, sourceText ) { - var self = this, - key, postData; - - key = this.conf.mt.youdao.key; - if ( key === null ) { - return BBPromise.reject( new Error( 'Youdao service is misconfigured' ) ); - } - - if ( sourceText.length > 10000 ) { - return BBPromise.reject( new Error( 'Source text too long: ' + + if ( sourceText.length > 10000 ) { + return Promise.reject( new Error( 'Source text too long: ' + sourceLang + '-' + targetLang ) ); + } + + postData = { + uri: this.conf.mt.youdao.api, + proxy: this.conf.proxy, + body: { + key: key, + type: 'data', + doctype: 'json', + q: sourceText, + l: youdaoLanguageNameMap[ sourceLang + '>' + targetLang ], + transtype: 'translate' + } + }; + + return preq.post( postData ).then( ( response ) => { + if ( response.body.errorCode === 0 ) { + return response.body.translation[ 0 ]; + } else { + throw new Error( 'Translation with Youdao failed. Error: ' + + this.getErrorName( response.body.errorCode ) + + ' ' + sourceLang + '>' + targetLang ); + } + } ); } - postData = { - uri: this.conf.mt.youdao.api, - proxy: this.conf.proxy, - body: { - key: key, - type: 'data', - doctype: 'json', - q: sourceText, - l: youdaoLanguageNameMap[ sourceLang + '>' + targetLang ], - transtype: 'translate' + /** + * Returns error name from error code. + * + * @param {number} code Error code + * @return {string} + */ + getErrorName( code ) { + var errormap = { + 10: 'Some sentence in source text is too long', + 11: 'No dictionay result', + 20: 'Source text too long', + 30: 'Server down', + 40: 'Unsupported language code', + 50: 'Invalid key', + 52: 'IP of the request is invalid', + 60: 'Reaching the spending limit for today', + 70: 'Insufficinent balance' + }; + + if ( code in errormap ) { + return errormap[ code ]; } - }; - return preq.post( postData ).then( function ( response ) { - if ( response.body.errorCode === 0 ) { - return response.body.translation[ 0 ]; - } else { - throw new Error( 'Translation with Youdao failed. Error: ' + - self.getErrorName( response.body.errorCode ) + - ' ' + sourceLang + '>' + targetLang ); - } - } ); -}; - -/** - * Returns error name from error code. - * - * @param {number} code Error code - * @return {string} - */ -Youdao.prototype.getErrorName = function ( code ) { - var errormap = { - 10: 'Some sentence in source text is too long', - 11: 'No dictionay result', - 20: 'Source text too long', - 30: 'Server down', - 40: 'Unsupported language code', - 50: 'Invalid key', - 52: 'IP of the request is invalid', - 60: 'Reaching the spending limit for today', - 70: 'Insufficinent balance' - }; - - if ( code in errormap ) { - return errormap[ code ]; + return 'Unknown error'; } - return 'Unknown error'; -}; - -Youdao.prototype.requiresAuthorization = function () { - return true; -}; - + requiresAuthorization() { + return true; + } +} module.exports = Youdao; -- To view, visit https://gerrit.wikimedia.org/r/372503 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0a9aaf9b71ee53d093956995cbe3073001f62081 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits