Santhosh has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/372120 )
Change subject: Add a sentence segment binary utility for debugging purposes
......................................................................
Add a sentence segment binary utility for debugging purposes
Change-Id: I9cee55f28c219d7026935ae2837fa070c7c77338
---
A bin/segment
1 file changed, 28 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/20/372120/1
diff --git a/bin/segment b/bin/segment
new file mode 100755
index 0000000..72724cc
--- /dev/null
+++ b/bin/segment
@@ -0,0 +1,28 @@
+#!/usr/bin/env node
+var script, xhtmlSource, xhtml, result, segmenter,
+	fs = require( 'fs' ),
+	Segmenter = require( __dirname + '/../lib/segmentation/CXSegmenter' ).CXSegmenter,
+	LinearDoc = require( __dirname + '/../lib/lineardoc' );
+
+// Strip CR/LF/tab characters from html, run it through LinearDoc.Normalizer, return the result.
+function normalize( html ) {
+	var normalizer = new LinearDoc.Normalizer();
+	normalizer.init();
+	normalizer.write( html.replace( /(\r\n|\n|\t|\r)/gm, '' ) );
+	return normalizer.getHtml();
+}
+
+// Exactly one argument (the input file path) is accepted; otherwise print usage and exit 1.
+script = process.argv[ 1 ];
+if ( process.argv.length !== 3 ) {
+	process.stderr.write( 'Usage: node ' + script + ' xhtmlSource\n' +
+		'xhtml must be wrapped in a block element such as <p>...</p> or <div>..</div>.\n' );
+	process.exit( 1 );
+}
+
+xhtmlSource = process.argv[ 2 ]; // path of the file to segment
+xhtml = fs.readFileSync( xhtmlSource, 'utf8' );
+segmenter = new Segmenter( xhtml, 'en' ); // second argument is hard-coded to 'en'
+segmenter.segment();
+result = normalize( segmenter.getSegmentedContent() );
+process.stdout.write( result + '\n' );
--
To view, visit https://gerrit.wikimedia.org/r/372120
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I9cee55f28c219d7026935ae2837fa070c7c77338
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits