[MediaWiki-commits] [Gerrit] Segmentation: Chinese segmentation - change (mediawiki...cxserver)

jenkins-bot (Code Review) Tue, 17 Feb 2015 22:15:35 -0800

jenkins-bot has submitted this change and it was merged.

Change subject: Segmentation: Chinese segmentation
......................................................................



Segmentation: Chinese segmentation

* Adds code for Chinese segmentation and test files

Bug: T89338
Change-Id: I1790064f35a3e1e5a32d1463527e5937c1a11176
---
A segmentation/languages/SegmenterZh.js
M segmentation/languages/index.js
M tests/segmentation/SegmentationTests.json
A tests/segmentation/data/result-20.html
A tests/segmentation/data/test-20.html
5 files changed, 44 insertions(+), 0 deletions(-)

Approvals:
  Amire80: Looks good to me, but someone else must approve
  Santhosh: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/segmentation/languages/SegmenterZh.js b/segmentation/languages/SegmenterZh.js
new file mode 100644
index 0000000..0dca2b9
--- /dev/null
+++ b/segmentation/languages/SegmenterZh.js
@@ -0,0 +1,33 @@
+var findAll = require( '../../lineardoc' ).Utils.findAll;
+
+/**
+ * Test a possible Chinese sentence boundary match.
+ *
+ * @param {string} text The plaintext to segment
+ * @param {Object} match The possible boundary match (returned by regex.exec)
+ * @return {number|null} The boundary offset, or null if not a sentence boundary (this implementation always returns an offset)
+ */
+function findBoundary( text, match ) {
+       var tail = text.slice( match.index + 1, text.length );
+
+       // Extend the boundary past any closing quotes that directly follow the matched character.
+       return match.index + 1 + tail.match( /^[」』”’]*/ )[ 0 ].length;
+}
+
+/**
+ * Find the sentence boundary offsets in a Chinese plaintext string.
+ *
+ * @param {string} text The plaintext to segment
+ * @return {number[]} Sentence boundary offsets (the result of findAll)
+ */
+function getBoundaries( text ) {
+       // Regex to find possible Chinese sentence boundaries:
+       // The Chinese full width '。', '！' and '？' are checked.
+       // This is the character that is used in Chinese Wikipedia for sentence 
ending.
+       // Must not use a shared regex instance (re.lastIndex is used), so a fresh /g literal is built on each call.
+       return findAll( text, /[。！？]/g, findBoundary );
+}
+
+module.exports = {
+       getBoundaries: getBoundaries
+};
diff --git a/segmentation/languages/index.js b/segmentation/languages/index.js
index 7226066..cdca08b 100644
--- a/segmentation/languages/index.js
+++ b/segmentation/languages/index.js
@@ -3,6 +3,7 @@
 module.exports.Segmenters = {
        en: require( __dirname + '/SegmenterEn.js' ),
        ja: require( __dirname + '/SegmenterJa.js' ),
+       zh: require( __dirname + '/SegmenterZh.js' ),
        hi: require( __dirname + '/SegmenterHi.js' ),
        sa: require( __dirname + '/SegmenterHi.js' ),
        default: require( __dirname + '/SegmenterDefault.js' )
diff --git a/tests/segmentation/SegmentationTests.json b/tests/segmentation/SegmentationTests.json
index 49af275..d8f7354 100644
--- a/tests/segmentation/SegmentationTests.json
+++ b/tests/segmentation/SegmentationTests.json
@@ -115,5 +115,13 @@
                        "source": "test-19.html",
                        "result": "result-19.html"
                }
+       ],
+       "zh": [
+               {
+                       "desc": "Chinese segmentation - basic test",
+                       "source": "test-20.html",
+                       "result": "result-20.html"
+               }
        ]
+
 }
diff --git a/tests/segmentation/data/result-20.html b/tests/segmentation/data/result-20.html
new file mode 100644
index 0000000..7e67908
--- /dev/null
+++ b/tests/segmentation/data/result-20.html
@@ -0,0 +1 @@
+<p id="0"><span class="cx-segment" data-segmentid="1">《史記》內容記載自傳說中的<a 
class="cx-link" data-linkid="2" href="/wiki/%E9%BB%83%E5%B8%9D" 
title="黃帝">黃帝</a>以來至<a class="cx-link" data-linkid="3" 
href="/wiki/%E6%BC%A2%E6%AD%A6%E5%B8%9D" title="漢武帝">漢武帝</a>時期以來的歷史，共分成〈<b><a 
class="cx-link" data-linkid="4" href="/wiki/%E6%9C%AC%E7%B4%80" 
title="本紀">本紀</a></b>〉、〈<b><a class="cx-link" data-linkid="5" 
href="/wiki/%E8%A1%A8_(%E5%8F%B2%E6%9B%B8)" title="表 
(史書)">表</a></b>〉、〈<b>書</b>〉、〈<b><a class="cx-link" data-linkid="6" 
href="/wiki/%E4%B8%96%E5%AE%B6" title="世家">世家</a></b>〉和〈<b><a class="cx-link" 
data-linkid="7" href="/wiki/%E5%88%97%E5%82%B3" 
title="列傳">列傳</a></b>〉五個主題，加上最後的〈太史公自序〉又細分成一百三十個章節。</span><span 
class="cx-segment" 
data-segmentid="8">其中，〈<b>本紀</b>〉是“天下”統治者的事蹟，“网罗天下放失旧闻，王迹所兴，原始察终，见盛观衰……著十二本纪，既科条之矣。”</span><span
 class="cx-segment" data-segmentid="9">；〈<b><a class="cx-link" data-linkid="10" 
href="/wiki/%E8%A1%A8_(%E5%8F%B2%E6%9B%B8)" title="表 
(史書)">表</a></b>〉以表格的方式排列整理事件次序或歷史動態，“并时异世，年差不明，作十表。”</span><span 
class="cx-segment" 
data-segmentid="11">；〈<b>書</b>〉的內容有關歷代典章制度，“礼乐损益，律历改易，兵权山川鬼神，天人之际，承敝通变，作八书”；〈<b>世家</b>〉描述影響深遠的家系或貴族事蹟，“二十八宿环北辰，三十辐共一毂，运行无穷。</span><span
 class="cx-segment" data-segmentid="12">辅拂股肱之臣配焉，忠信行道，以奉主上，作三十世家。”</span><span 
class="cx-segment" 
data-segmentid="13">；〈<b>列傳</b>〉呈現的是歷史上各類人物的歷史表現與社會的種種樣貌，“扶义倜傥，不令己失时，立功名于天下，作七十列传。”</span><span
 class="cx-segment" data-segmentid="14">。</span><span class="cx-segment" 
data-segmentid="15">不同於以往的史書，《史記》的寫作方式首開<a class="cx-link" data-linkid="16" 
href="/wiki/%E7%B4%80%E5%82%B3%E9%AB%94" 
title="紀傳體">紀傳體</a>之先河：以描寫人物的生平為主，年代先後為副。</span><span class="cx-segment" 
data-segmentid="17">至此以後，尚有《漢書》、《三國志》和《後漢書》等史著仿效該體，讓紀傳體成為<a class="cx-link" 
data-linkid="18" href="/wiki/%E5%94%90%E4%BB%A3" 
title="唐代">唐代</a>以後官方史著所採用的主流寫作方式。</span><span class="cx-segment" 
data-segmentid="19"><a class="cx-link" data-linkid="20" 
href="/wiki/%E8%B6%99%E7%BF%BC" title="趙翼">趙翼</a>《<a class="cx-link" 
data-linkid="21" href="/wiki/%E5%BB%BF%E4%BA%8C%E5%8F%B2%E5%8A%84%E8%A8%98" 
title="廿二史劄記">廿二史劄記</a>》云：「司馬遷參酌古今，發凡起例，創為全史，本紀以序帝王，世家以記侯國，十表以繫時事，八書以詳制度，列傳以誌人物，然後一代君臣政事賢否得失，總彙於一編之中。</span><span
 class="cx-segment" data-segmentid="22">自此例一定，歷代作史者，遂不能出其範圍，信史家之極則也。」</span></p>
diff --git a/tests/segmentation/data/test-20.html b/tests/segmentation/data/test-20.html
new file mode 100644
index 0000000..89d30cc
--- /dev/null
+++ b/tests/segmentation/data/test-20.html
@@ -0,0 +1 @@
+<p>《史記》內容記載自傳說中的<a href="/wiki/%E9%BB%83%E5%B8%9D" title="黃帝" 
class="mw-redirect">黃帝</a>以來至<a href="/wiki/%E6%BC%A2%E6%AD%A6%E5%B8%9D" 
title="漢武帝" class="mw-redirect">漢武帝</a>時期以來的歷史，共分成〈<b><a 
href="/wiki/%E6%9C%AC%E7%B4%80" title="本紀">本紀</a></b>〉、〈<b><a 
href="/wiki/%E8%A1%A8_(%E5%8F%B2%E6%9B%B8)" title="表 
(史書)">表</a></b>〉、〈<b>書</b>〉、〈<b><a href="/wiki/%E4%B8%96%E5%AE%B6" title="世家" 
class="mw-disambig">世家</a></b>〉和〈<b><a href="/wiki/%E5%88%97%E5%82%B3" 
title="列傳">列傳</a></b>〉五個主題，加上最後的〈太史公自序〉又細分成一百三十個章節。其中，〈<b>本紀</b>〉是“天下”統治者的事蹟，“网罗天下放失旧闻，王迹所兴，原始察终，见盛观衰……著十二本纪，既科条之矣。”；〈<b><a
 href="/wiki/%E8%A1%A8_(%E5%8F%B2%E6%9B%B8)" title="表 
(史書)">表</a></b>〉以表格的方式排列整理事件次序或歷史動態，“并时异世，年差不明，作十表。”；〈<b>書</b>〉的內容有關歷代典章制度，“礼乐损益，律历改易，兵权山川鬼神，天人之际，承敝通变，作八书”；〈<b>世家</b>〉描述影響深遠的家系或貴族事蹟，“二十八宿环北辰，三十辐共一毂，运行无穷。辅拂股肱之臣配焉，忠信行道，以奉主上，作三十世家。”；〈<b>列傳</b>〉呈現的是歷史上各類人物的歷史表現與社會的種種樣貌，“扶义倜傥，不令己失时，立功名于天下，作七十列传。”。不同於以往的史書，《史記》的寫作方式首開<a
 href="/wiki/%E7%B4%80%E5%82%B3%E9%AB%94" title="紀傳體" 
class="mw-redirect">紀傳體</a>之先河：以描寫人物的生平為主，年代先後為副。至此以後，尚有《漢書》、《三國志》和《後漢書》等史著仿效該體，讓紀傳體成為<a
 href="/wiki/%E5%94%90%E4%BB%A3" title="唐代" 
class="mw-redirect">唐代</a>以後官方史著所採用的主流寫作方式。<a href="/wiki/%E8%B6%99%E7%BF%BC" 
title="趙翼">趙翼</a>《<a href="/wiki/%E5%BB%BF%E4%BA%8C%E5%8F%B2%E5%8A%84%E8%A8%98" 
title="廿二史劄記" 
class="mw-redirect">廿二史劄記</a>》云：「司馬遷參酌古今，發凡起例，創為全史，本紀以序帝王，世家以記侯國，十表以繫時事，八書以詳制度，列傳以誌人物，然後一代君臣政事賢否得失，總彙於一編之中。自此例一定，歷代作史者，遂不能出其範圍，信史家之極則也。」</p>

-- 
To view, visit https://gerrit.wikimedia.org/r/191193
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I1790064f35a3e1e5a32d1463527e5937c1a11176
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Jsahleen <jsahl...@wikimedia.org>
Gerrit-Reviewer: Amire80 <amir.ahar...@mail.huji.ac.il>
Gerrit-Reviewer: Santhosh <santhosh.thottin...@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Segmentation: Chinese segmentation - change (mediawiki...cxserver)

Reply via email to