This is an automated email from the ASF dual-hosted git repository.

myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git


The following commit(s) were added to refs/heads/master by this push:
     new 141862c  [HIVEMALL-307][DOC] Update tokenize_ko examples
141862c is described below

commit 141862c58b5d82bd313b877c703b2d8c3a39d0b1
Author: Makoto Yui <[email protected]>
AuthorDate: Fri May 14 12:25:13 2021 +0900

    [HIVEMALL-307][DOC] Update tokenize_ko examples
    
    ## What changes were proposed in this pull request?
    
    Update tokenize_ko examples
    
    ## What type of PR is it?
    
    Documentation
    
    ## What is the Jira issue?
    
    https://issues.apache.org/jira/browse/HIVEMALL-307
    
    Author: Makoto Yui <[email protected]>
    
    Closes #243 from myui/update_tokenize_ko_example.
---
 docs/gitbook/misc/tokenizer.md | 94 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 77 insertions(+), 17 deletions(-)

diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index 3992e2c..dccd1fb 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -235,20 +235,87 @@ See the following examples for the usage.
 select tokenize_ko();
 > 8.8.2
 
-select tokenize_ko('소설 무궁화꽃이 피었습니다.');
-> ["소설","무궁","화","꽃","피"]
+select tokenize_ko('중요한 새 기능을 개발해줘서 정말 고마워요!');
+> ["중요","기능","개발","주","고맙"]
+
+-- explicitly using default options
+select tokenize_ko('중요한 새 기능을 개발해줘서 정말 고마워요!', '-mode discard', 
+  -- stopwords (null to use default)
+  -- see https://github.com/apache/incubator-hivemall/blob/master/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt
+  null, 
+  -- stoptags
+  -- see https://lucene.apache.org/core/8_8_2/analyzers-nori/org/apache/lucene/analysis/ko/POS.Tag.html
+  array(
+   'E',   -- Verbal endings
+   'IC',  -- Interjection
+   'J',   -- Ending Particle
+   'MAG', -- General Adverb
+   'MAJ', -- Conjunctive adverb
+   'MM',  -- Determiner
+   'SP',  -- Space 
+   'SSC', -- Closing brackets
+   'SSO', -- Opening brackets
+   'SC',  -- Separator
+   'SE',  -- Ellipsis
+   'XPN', -- Prefix
+   'XSA', -- Adjective Suffix
+   'XSN', -- Noun Suffix
+   'XSV', -- Verb Suffix
+   'UNA', -- Unknown
+   'NA',  -- Unknown
+   'VSV'  -- Unknown
+  )
+);
+> ["중요","기능","개발","주","고맙"]
+
+-- None mode, without General Adverb (MAG)
+select tokenize_ko('중요한 새 기능을 개발해줘서 정말 고마워요!', 
+  -- No decomposition for compound.
+  '-mode none', 
+  -- stopwords (null to use default)
+  null, 
+  array(
+   'E',   -- Verbal endings
+   'IC',  -- Interjection
+   'J',   -- Ending Particle
+   -- 'MAG', -- General Adverb
+   'MAJ', -- Conjunctive adverb
+   'MM',  -- Determiner
+   'SP',  -- Space 
+   'SSC', -- Closing brackets
+   'SSO', -- Opening brackets
+   'SC',  -- Separator
+   'SE',  -- Ellipsis
+   'XPN', -- Prefix
+   'XSA', -- Adjective Suffix
+   'XSN', -- Noun Suffix
+   'XSV', -- Verb Suffix
+   'UNA', -- Unknown
+   'NA',  -- Unknown
+   'VSV'  -- Unknown
+  )
+);
+> ["중요","기능","개발","줘서","정말","고마워요"]
+
+-- discard mode: Decomposes compounds and discards the original form (default).
+-- https://lucene.apache.org/core/8_8_2/analyzers-nori/org/apache/lucene/analysis/ko/KoreanTokenizer.DecompoundMode.html
+select tokenize_ko('중요한 새 기능을 개발해줘서 정말 고마워요!', '-mode discard');
+> ["중요","기능","개발","주","고맙"]
 
-select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode discard');
-> ["소설","무궁","화","꽃","피"]
+-- default stopwords (null), with stoptags
+select tokenize_ko('중요한 새 기능을 개발해줘서 정말 고마워요!', '-mode discard', null, array('E', 'VV'));
+> ["중요","하","새","기능","을","개발","하","주","정말","고맙"]
 
-select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'mixed');
-> ["소설","무궁화","무궁","화","꽃","피"]
+-- mixed mode: Decomposes compounds and keeps the original form.
+select tokenize_ko('중요한 새 기능을 개발해줘서 정말 고마워요!', 'mixed');
+> ["중요","기능","개발","줘서","주","고마워요","고맙"]
 
-select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode mixed');
-> ["소설","무궁화","무궁","화","꽃","피"]
+select tokenize_ko('중요한 새 기능을 개발해줘서 정말 고마워요!', '-mode mixed');
+> ["중요","기능","개발","줘서","주","고마워요","고맙"]
 
-select tokenize_ko('소설 무궁화꽃이 피었습니다.', '-mode none');
-> ["소설","무궁화","꽃","피"]
+-- none mode: No decomposition for compound.
+select tokenize_ko('중요한 새 기능을 개발해줘서 정말 고마워요!', '-mode none');
+> ["중요","기능","개발","줘서","고마워요"]
 
 select tokenize_ko('Hello, world.', '-mode none');
 > ["hello","world"]
@@ -256,13 +323,6 @@ select tokenize_ko('Hello, world.', '-mode none');
 select tokenize_ko('Hello, world.', '-mode none -outputUnknownUnigrams');
 > ["h","e","l","l","o","w","o","r","l","d"]
 
--- default stopward (null), with stoptags
-select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'discard', null, array('E'));
-> ["소설","무궁","화","꽃","이","피"]
-
-select tokenize_ko('소설 무궁화꽃이 피었습니다.', 'discard', null, array('E', 'VV'));
-> ["소설","무궁","화","꽃","이"]
-
 select tokenize_ko('나는 C++ 언어를 프로그래밍 언어로 사랑한다.', '-mode discard');
 > ["나","c","언어","프로그래밍","언어","사랑"]
 

Reply via email to