Santhosh has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/372120 )

Change subject: Add a sentence segment binary utility for debugging purposes
......................................................................

Add a sentence segment binary utility for debugging purposes

Change-Id: I9cee55f28c219d7026935ae2837fa070c7c77338
---
A bin/segment
1 file changed, 28 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/20/372120/1

diff --git a/bin/segment b/bin/segment
new file mode 100755
index 0000000..72724cc
--- /dev/null
+++ b/bin/segment
@@ -0,0 +1,28 @@
+#!/usr/bin/env node
+var script, xhtmlSource, xhtml, result, segmenter,
+	fs = require( 'fs' ),
+	Segmenter = require( __dirname + '/../lib/segmentation/CXSegmenter' ).CXSegmenter,
+	LinearDoc = require( __dirname + '/../lib/lineardoc' );
+
+function normalize( html ) {
+	var normalizer = new LinearDoc.Normalizer();
+	normalizer.init();
+	normalizer.write( html.replace( /(\r\n|\n|\t|\r)/gm, '' ) );
+	return normalizer.getHtml();
+}
+
+script = process.argv[ 1 ];
+if ( process.argv.length !== 3 ) {
+	process.stderr.write(
+		'Usage: node ' + script + ' xhtmlSource\n' +
+		'xhtml must be wrapped in a block element such as <p>...</p> or <div>..</div>.\n'
+	);
+	process.exit( 1 );
+}
+
+xhtmlSource = process.argv[ 2 ];
+xhtml = fs.readFileSync( xhtmlSource, 'utf8' );
+segmenter = new Segmenter( xhtml, 'en' );
+segmenter.segment();
+result = normalize( segmenter.getSegmentedContent() );
+process.stdout.write( result + '\n' );

-- 
To view, visit https://gerrit.wikimedia.org/r/372120
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9cee55f28c219d7026935ae2837fa070c7c77338
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits