jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/372120 )
Change subject: Add a sentence segment binary utility for debugging purposes
......................................................................
Add a sentence segment binary utility for debugging purposes
Change-Id: I9cee55f28c219d7026935ae2837fa070c7c77338
---
A bin/segment
1 file changed, 28 insertions(+), 0 deletions(-)
Approvals:
Catrope: Looks good to me, approved
jenkins-bot: Verified
diff --git a/bin/segment b/bin/segment
new file mode 100755
index 0000000..0ccb3a5
--- /dev/null
+++ b/bin/segment
@@ -0,0 +1,28 @@
+#!/usr/bin/env node
+var script, xhtmlSource, xhtml, result, segmenter,
+ fs = require( 'fs' ),
+ Segmenter = require( __dirname + '/../lib/segmentation/CXSegmenter' ).CXSegmenter,
+ LinearDoc = require( __dirname + '/../lib/lineardoc' );
+
+function normalize( html ) {
+ var normalizer = new LinearDoc.Normalizer();
+ normalizer.init();
+ normalizer.write( html.replace( /(\r\n|\n|\t|\r)/gm, '' ) );
+ return normalizer.getHtml();
+}
+
+html = fs.readFileSync( '/dev/stdin', 'utf8' );
+if ( html.trim() === '' ) {
+ script = process.argv[ 1 ];
+ process.stderr.write(
+ 'Usage: node ' + script + ' < file\n' +
+ 'Input must be wrapped in a block element such as <p>...</p> or <div>..</div>.\n'
+ );
+ process.exit( 1 );
+
+}
+
+segmenter = new Segmenter( html, 'en' );
+segmenter.segment();
+result = normalize( segmenter.getSegmentedContent() );
+process.stdout.write( result + '\n' );
--
To view, visit https://gerrit.wikimedia.org/r/372120
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9cee55f28c219d7026935ae2837fa070c7c77338
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>
Gerrit-Reviewer: Catrope <[email protected]>
Gerrit-Reviewer: Divec <[email protected]>
Gerrit-Reviewer: Nikerabbit <[email protected]>
Gerrit-Reviewer: Santhosh <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits