Amire80 has uploaded a new change for review. https://gerrit.wikimedia.org/r/211400
Change subject: Add segmentation for Ethiopic languages ...................................................................... Add segmentation for Ethiopic languages Bug: T98345 Change-Id: I3a47e630dcbaff8f5b9c60b1a4d48b3db7d5606d --- A segmentation/languages/SegmenterAm.js M segmentation/languages/index.js M tests/segmentation/SegmentationTests.json A tests/segmentation/data/result-23.html A tests/segmentation/data/test-23.html 5 files changed, 54 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/00/211400/1 diff --git a/segmentation/languages/SegmenterAm.js b/segmentation/languages/SegmenterAm.js new file mode 100644 index 0000000..e3bdf38 --- /dev/null +++ b/segmentation/languages/SegmenterAm.js @@ -0,0 +1,43 @@ +var findAll = require( '../../lineardoc' ).Utils.findAll; + +/** + * Test a possible Amharic sentence boundary match + * + * @param {string} text The plaintext to segment + * @param {Object} match The possible boundary match (returned by regex.exec) + * @return {number|null} The boundary offset, or null if not a sentence boundary + */ + +function findBoundary( text, match ) { + var tail = text.slice( match.index + 1, text.length ); + + // Trailing non-final punctuation: not a sentence boundary + if ( tail.match( /^[,;:]/ ) ) { + return null; + } + + // Next word character is number or lower-case: not a sentence boundary + if ( tail.match( /^\W*[0-9a-z]/ ) ) { + return null; + } + + // Include any closing punctuation and trailing space + return match.index + 1 + tail.match( /^['”"’]*\s*/ )[ 0 ].length; +} + +/** + * Find Amharic sentence boundaries + * + * @param {string} text The plaintext to segment + * @returns {number[]} Sentence boundary offsets + */ +function getBoundaries( text ) { + // Regex to find possible Amharic sentence boundaries. + // Must not use a shared regex instance (re.lastIndex is used). + // In the Ethiopic script ። is used as a full stop. 
+ return findAll( text, /[።!?]/g, findBoundary ); +} + +module.exports = { + getBoundaries: getBoundaries +}; diff --git a/segmentation/languages/index.js b/segmentation/languages/index.js index 766d654..f5e623b 100644 --- a/segmentation/languages/index.js +++ b/segmentation/languages/index.js @@ -1,12 +1,14 @@ 'use strict'; module.exports.Segmenters = { + am: require( __dirname + '/SegmenterAm.js' ), en: require( __dirname + '/SegmenterEn.js' ), hi: require( __dirname + '/SegmenterHi.js' ), hy: require( __dirname + '/SegmenterHy.js' ), ja: require( __dirname + '/SegmenterJa.js' ), pa: require( __dirname + '/SegmenterHi.js' ), sa: require( __dirname + '/SegmenterHi.js' ), + ti: require( __dirname + '/SegmenterAm.js' ), zh: require( __dirname + '/SegmenterZh.js' ), default: require( __dirname + '/SegmenterDefault.js' ) }; diff --git a/tests/segmentation/SegmentationTests.json b/tests/segmentation/SegmentationTests.json index 2f6eb3c..9ca712a 100644 --- a/tests/segmentation/SegmentationTests.json +++ b/tests/segmentation/SegmentationTests.json @@ -1,4 +1,11 @@ { + "am": [ + { + "desc": "Amharic segmentation - basic test", + "source": "test-23.html", + "result": "result-23.html" + } + ], "en": [ { "desc": "Simple paragraph test", diff --git a/tests/segmentation/data/result-23.html b/tests/segmentation/data/result-23.html new file mode 100644 index 0000000..19bdb26 --- /dev/null +++ b/tests/segmentation/data/result-23.html @@ -0,0 +1 @@ +<p id="0"><span class="cx-segment" data-segmentid="1">ቴዎድሮስ <a class="cx-link" data-linkid="2" href="/wiki/%E1%8C%A5%E1%88%AD_%E1%8D%AE" title="ጥር ፮">ጥር ፮</a> ቀን <a class="cx-link" data-linkid="3" href="/w/index.php?title=1811&action=edit&redlink=1" title="1811 (ገጹ ገና አልተጻፈም)">፲፰፻፲፩</a> ዓ.ም. 
ሻርጌ በተባለ ቦታ <a class="cx-link" data-linkid="4" href="/wiki/%E1%89%8B%E1%88%AB" title="ቋራ">ቋራ</a> ውስጥ፣ ከ<a class="cx-link" data-linkid="5" href="/wiki/%E1%8C%8E%E1%8A%95%E1%8B%B0%E1%88%AD" title="ጎንደር">ጎንደር ከተማ</a> በስተ ምዕራብ ተወለዱ። </span><span class="cx-segment" data-segmentid="6">የተወለዱትም አገሪቷ በባላባቶች ተከፋፍላ በምትመራበት-<a class="cx-link" data-linkid="7" href="/wiki/%E1%8B%98%E1%88%98%E1%8A%90_%E1%88%98%E1%88%B3%E1%8D%8D%E1%8A%95%E1%89%B5" title="ዘመነ መሳፍንት">ዘመነ መሳፍንት</a> በሚባለው ወቅት ነበር። </span><span class="cx-segment" data-segmentid="8">አባታቸው <a class="cx-link" data-linkid="9" href="/w/index.php?title=%E1%8B%B0%E1%8C%83%E1%8B%9D%E1%88%9B%E1%89%BD&action=edit&redlink=1" title="ደጃዝማች (ገጹ ገና አልተጻፈም)">ደጃዝማች</a> <a class="cx-link" data-linkid="10" href="/w/index.php?title=%E1%8A%83%E1%8B%AD%E1%88%89_%E1%8B%88%E1%88%8D%E1%8B%B0_%E1%8C%8A%E1%8B%AE%E1%88%AD%E1%8C%8A%E1%88%B5&action=edit&redlink=1" title="ኃይሉ ወልደ ጊዮርጊስ (ገጹ ገና አልተጻፈም)">ኃይሉ ወልደ ጊዮርጊስ</a> የቋራ ገዢ ነበሩ። </span><span class="cx-segment" data-segmentid="11">ዓፄ ቴዎድሮስ በህጻንነታቸው የቄስ ትምህርት ከቀሰሙ በኋላ፣ የአጎታቸውንና በኋላም ለጥቂት ጊዜ የ<a class="cx-link" data-linkid="12" href="/wiki/%E1%8C%8E%E1%8C%83%E1%88%9D" title="ጎጃም">ጎጃሙን</a> ጦር መሪ የ<a class="cx-link" data-linkid="13" href="/w/index.php?title=%E1%8C%8E%E1%88%B9_%E1%8B%98%E1%8B%8D%E1%8B%B4&action=edit&redlink=1" title="ጎሹ ዘውዴ (ገጹ ገና አልተጻፈም)">ጎሹ ዘውዴ</a>ን ጦር ተቀላቀሉ። </span><span class="cx-segment" data-segmentid="14">በዚሁ የውትድርና ዘመናቸው ከፍተኛ ችሎታን ማስመዝገብ ስለጀመሩና ዝናቸው ስለተስፋፋ በ<a class="cx-link" data-linkid="15" href="/w/index.php?title=1839&action=edit&redlink=1" title="1839 (ገጹ ገና አልተጻፈም)">፲፰፻፴፱</a> ዓ.ም. 
በወይዘሮ <a class="cx-link" data-linkid="16" href="/w/index.php?title=%E1%88%98%E1%8A%90%E1%8A%95_%E1%88%8A%E1%89%A0%E1%8A%95_%E1%8A%A0%E1%88%9D%E1%8B%B4&action=edit&redlink=1" title="መነን ሊበን አምዴ (ገጹ ገና አልተጻፈም)">መነን ሊበን አምዴ</a> አነሳሽነት የልጇን የራስ <a class="cx-link" data-linkid="17" href="/w/index.php?title=%E1%8A%A0%E1%88%8A_%E1%8A%A0%E1%88%89%E1%88%8B&action=edit&redlink=1" title="አሊ አሉላ (ገጹ ገና አልተጻፈም)">አሊ አሉላ</a>ን ልጅ፣ <a class="cx-link" data-linkid="18" href="/w/index.php?title=%E1%89%B0%E1%8B%8B%E1%89%A0%E1%89%BD_%E1%8A%A0%E1%88%8A&action=edit&redlink=1" title="ተዋበች አሊ (ገጹ ገና አልተጻፈም)">ተዋበች አሊ</a>ን ተዳሩ፤ እንዲሁም በ<a class="cx-link" data-linkid="19" href="/w/index.php?title=%E1%8B%B0%E1%8C%83%E1%8B%9D%E1%88%9B%E1%89%BD&action=edit&redlink=1" title="ደጃዝማች (ገጹ ገና አልተጻፈም)">ደጃዝማች</a>ነት ማዕረግ የቋራ አስተዳዳሪ ሆነው ተሾሙ። </span><span class="cx-segment" data-segmentid="20">ቴዎድሮስ ግን በ<a class="cx-link" data-linkid="21" href="/w/index.php?title=1844&action=edit&redlink=1" title="1844 (ገጹ ገና አልተጻፈም)">፲፰፻፵፬</a> ዓ.ም. 
አጠቃላይ የዘመነ መሳፍንት ሥርዓትን በመቃወም በሰሜናዊ ባላባቶች ላይ ዘመቻ ጀመሩ። </span><span class="cx-segment" data-segmentid="22">በኒህ ተከታታይ ዘመቻወች የገጠሟቸውን ባላባቶች ስላሸነፉ፣ መጀመሪያ የ<a class="cx-link" data-linkid="23" href="/wiki/%E1%88%AB%E1%88%B5" title="ራስ">ራስ</a> ማዕረግን በኋላም የ<a class="cx-link" data-linkid="24" href="/wiki/%E1%8A%95%E1%8C%89%E1%88%A5" title="ንጉሥ">ንጉሥ</a> ማዕረግን በአንድ ዓመት ውስጥ ተቀዳጁ። </span><span class="cx-segment" data-segmentid="25">በየጊዜው በሚያደርጉት የተሳካ ዘመቻ የዘመኑን ባላባቶች ኃይል በመሰባበር <a class="cx-link" data-linkid="26" href="/wiki/%E1%8B%A8%E1%8A%AB%E1%89%B2%E1%89%B5_3" title="የካቲት 3">የካቲት ፫</a> ቀን <a class="cx-link" data-linkid="27" href="/w/index.php?title=1847&action=edit&redlink=1" title="1847 (ገጹ ገና አልተጻፈም)">፲፰፻፵፯</a> ዓ.ም ንጉሥ ካሳ - ዳግማዊ ዓፄ ቴዎድሮስ ተብለው የኢትዮጵያ ንጉሠ ነገሥት ሆኑ።</span></p> diff --git a/tests/segmentation/data/test-23.html b/tests/segmentation/data/test-23.html new file mode 100644 index 0000000..6979b02 --- /dev/null +++ b/tests/segmentation/data/test-23.html @@ -0,0 +1 @@ +<p>ቴዎድሮስ <a href="/wiki/%E1%8C%A5%E1%88%AD_%E1%8D%AE" title="ጥር ፮">ጥር ፮</a> ቀን <a href="/w/index.php?title=1811&action=edit&redlink=1" class="new" title="1811 (ገጹ ገና አልተጻፈም)">፲፰፻፲፩</a> ዓ.ም. 
ሻርጌ በተባለ ቦታ <a href="/wiki/%E1%89%8B%E1%88%AB" title="ቋራ">ቋራ</a> ውስጥ፣ ከ<a href="/wiki/%E1%8C%8E%E1%8A%95%E1%8B%B0%E1%88%AD" title="ጎንደር" class="mw-redirect">ጎንደር ከተማ</a> በስተ ምዕራብ ተወለዱ። የተወለዱትም አገሪቷ በባላባቶች ተከፋፍላ በምትመራበት-<a href="/wiki/%E1%8B%98%E1%88%98%E1%8A%90_%E1%88%98%E1%88%B3%E1%8D%8D%E1%8A%95%E1%89%B5" title="ዘመነ መሳፍንት">ዘመነ መሳፍንት</a> በሚባለው ወቅት ነበር። አባታቸው <a href="/w/index.php?title=%E1%8B%B0%E1%8C%83%E1%8B%9D%E1%88%9B%E1%89%BD&action=edit&redlink=1" class="new" title="ደጃዝማች (ገጹ ገና አልተጻፈም)">ደጃዝማች</a> <a href="/w/index.php?title=%E1%8A%83%E1%8B%AD%E1%88%89_%E1%8B%88%E1%88%8D%E1%8B%B0_%E1%8C%8A%E1%8B%AE%E1%88%AD%E1%8C%8A%E1%88%B5&action=edit&redlink=1" class="new" title="ኃይሉ ወልደ ጊዮርጊስ (ገጹ ገና አልተጻፈም)">ኃይሉ ወልደ ጊዮርጊስ</a> የቋራ ገዢ ነበሩ። ዓፄ ቴዎድሮስ በህጻንነታቸው የቄስ ትምህርት ከቀሰሙ በኋላ፣ የአጎታቸውንና በኋላም ለጥቂት ጊዜ የ<a href="/wiki/%E1%8C%8E%E1%8C%83%E1%88%9D" title="ጎጃም">ጎጃሙን</a> ጦር መሪ የ<a href="/w/index.php?title=%E1%8C%8E%E1%88%B9_%E1%8B%98%E1%8B%8D%E1%8B%B4&action=edit&redlink=1" class="new" title="ጎሹ ዘውዴ (ገጹ ገና አልተጻፈም)">ጎሹ ዘውዴ</a>ን ጦር ተቀላቀሉ። በዚሁ የውትድርና ዘመናቸው ከፍተኛ ችሎታን ማስመዝገብ ስለጀመሩና ዝናቸው ስለተስፋፋ በ<a href="/w/index.php?title=1839&action=edit&redlink=1" class="new" title="1839 (ገጹ ገና አልተጻፈም)">፲፰፻፴፱</a> ዓ.ም. 
በወይዘሮ <a href="/w/index.php?title=%E1%88%98%E1%8A%90%E1%8A%95_%E1%88%8A%E1%89%A0%E1%8A%95_%E1%8A%A0%E1%88%9D%E1%8B%B4&action=edit&redlink=1" class="new" title="መነን ሊበን አምዴ (ገጹ ገና አልተጻፈም)">መነን ሊበን አምዴ</a> አነሳሽነት የልጇን የራስ <a href="/w/index.php?title=%E1%8A%A0%E1%88%8A_%E1%8A%A0%E1%88%89%E1%88%8B&action=edit&redlink=1" class="new" title="አሊ አሉላ (ገጹ ገና አልተጻፈም)">አሊ አሉላ</a>ን ልጅ፣ <a href="/w/index.php?title=%E1%89%B0%E1%8B%8B%E1%89%A0%E1%89%BD_%E1%8A%A0%E1%88%8A&action=edit&redlink=1" class="new" title="ተዋበች አሊ (ገጹ ገና አልተጻፈም)">ተዋበች አሊ</a>ን ተዳሩ፤ እንዲሁም በ<a href="/w/index.php?title=%E1%8B%B0%E1%8C%83%E1%8B%9D%E1%88%9B%E1%89%BD&action=edit&redlink=1" class="new" title="ደጃዝማች (ገጹ ገና አልተጻፈም)">ደጃዝማች</a>ነት ማዕረግ የቋራ አስተዳዳሪ ሆነው ተሾሙ። ቴዎድሮስ ግን በ<a href="/w/index.php?title=1844&action=edit&redlink=1" class="new" title="1844 (ገጹ ገና አልተጻፈም)">፲፰፻፵፬</a> ዓ.ም. አጠቃላይ የዘመነ መሳፍንት ሥርዓትን በመቃወም በሰሜናዊ ባላባቶች ላይ ዘመቻ ጀመሩ። በኒህ ተከታታይ ዘመቻወች የገጠሟቸውን ባላባቶች ስላሸነፉ፣ መጀመሪያ የ<a href="/wiki/%E1%88%AB%E1%88%B5" title="ራስ">ራስ</a> ማዕረግን በኋላም የ<a href="/wiki/%E1%8A%95%E1%8C%89%E1%88%A5" title="ንጉሥ">ንጉሥ</a> ማዕረግን በአንድ ዓመት ውስጥ ተቀዳጁ። በየጊዜው በሚያደርጉት የተሳካ ዘመቻ የዘመኑን ባላባቶች ኃይል በመሰባበር <a href="/wiki/%E1%8B%A8%E1%8A%AB%E1%89%B2%E1%89%B5_3" title="የካቲት 3" class="mw-redirect">የካቲት ፫</a> ቀን <a href="/w/index.php?title=1847&action=edit&redlink=1" class="new" title="1847 (ገጹ ገና አልተጻፈም)">፲፰፻፵፯</a> ዓ.ም ንጉሥ ካሳ - ዳግማዊ ዓፄ ቴዎድሮስ ተብለው የኢትዮጵያ ንጉሠ ነገሥት ሆኑ።</p> -- To view, visit https://gerrit.wikimedia.org/r/211400 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3a47e630dcbaff8f5b9c60b1a4d48b3db7d5606d Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Amire80 <amir.ahar...@mail.huji.ac.il> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits