This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 5b2455d Update tokenize_ko usage
5b2455d is described below
commit 5b2455dcfa39df384200e8a819d57cc4b9936aef
Author: Makoto Yui <[email protected]>
AuthorDate: Fri May 14 12:27:23 2021 +0900
Update tokenize_ko usage
---
userguide/misc/tokenizer.html | 96 +++++++++++++++++++++++++++++++++++--------
1 file changed, 78 insertions(+), 18 deletions(-)
diff --git a/userguide/misc/tokenizer.html b/userguide/misc/tokenizer.html
index bf1abf0..2c8e824 100644
--- a/userguide/misc/tokenizer.html
+++ b/userguide/misc/tokenizer.html
@@ -2568,20 +2568,87 @@ usage: tokenize_ko(String line [, const string mode =
"discard" (or co
select tokenize_ko();
> 8.8.2
-select tokenize_ko('소설
무궁화꽃이
피었습니다.');
->
["소설","무궁","화","꽃","피"]
+select tokenize_ko('중요한 새
기능을 개발해줘서
정말 고마워요!');
+>
["중요","기능","개발","주","고맙"]
+
+-- explicitly using default options
+select tokenize_ko('중요한 새
기능을 개발해줘서
정말 고마워요!', '-mode
discard',
+ -- stopwords (null to use default)
+ -- see
https://github.com/apache/incubator-hivemall/blob/master/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt
+ null,
+ -- stoptags
+ -- see
https://lucene.apache.org/core/8_8_2/analyzers-nori/org/apache/lucene/analysis/ko/POS.Tag.html
+ array(
+ 'E', -- Verbal endings
+ 'IC', -- Interjection
+ 'J', -- Ending Particle
+ 'MAG', -- General Adverb
+ 'MAJ', -- Conjunctive adverb
+ 'MM', -- Determiner
+ 'SP', -- Space
+ 'SSC', -- Closing brackets
+ 'SSO', -- Opening brackets
+ 'SC', -- Separator
+ 'SE', -- Ellipsis
+ 'XPN', -- Prefix
+ 'XSA', -- Adjective Suffix
+ 'XSN', -- Noun Suffix
+ 'XSV', -- Verb Suffix
+ 'UNA', -- Unknown
+ 'NA', -- Unknown
+ 'VSV' -- Unknown
+ )
+);
+>
["중요","기능","개발","주","고맙"]
+
+-- None mode, without General Adverb (MAG)
+select tokenize_ko('중요한 새
기능을 개발해줘서
정말 고마워요!',
+ -- No decomposition for compound.
+ '-mode none',
+ -- stopwords (null to use default)
+ null,
+ array(
+ 'E', -- Verbal endings
+ 'IC', -- Interjection
+ 'J', -- Ending Particle
+ -- 'MAG', -- General Adverb
+ 'MAJ', -- Conjunctive adverb
+ 'MM', -- Determiner
+ 'SP', -- Space
+ 'SSC', -- Closing brackets
+ 'SSO', -- Opening brackets
+ 'SC', -- Separator
+ 'SE', -- Ellipsis
+ 'XPN', -- Prefix
+ 'XSA', -- Adjective Suffix
+ 'XSN', -- Noun Suffix
+ 'XSV', -- Verb Suffix
+ 'UNA', -- Unknown
+ 'NA', -- Unknown
+ 'VSV' -- Unknown
+ )
+);
+>
["중요","기능","개발","줘서","정말","고마워요"]
+
+-- discard mode: Decompose compounds and discards the original form (default).
+--
https://lucene.apache.org/core/8_8_2/analyzers-nori/org/apache/lucene/analysis/ko/KoreanTokenizer.DecompoundMode.html
+select tokenize_ko('중요한 새
기능을 개발해줘서
정말 고마워요!', '-mode
discard');
+>
["중요","기능","개발","주","고맙"]
-select tokenize_ko('소설
무궁화꽃이
피었습니다.', '-mode discard');
->
["소설","무궁","화","꽃","피"]
+-- default stopword (null), with stoptags
+select tokenize_ko('중요한 새
기능을 개발해줘서
정말 고마워요!', '-mode
discard', null, array('E', 'VV'));
+>
["중요","하","새","기능","을","개발","하","주","정말","고맙"]
-select tokenize_ko('소설
무궁화꽃이
피었습니다.', 'mixed');
->
["소설","무궁화","무궁","화","꽃","피"]
+-- mixed mode: Decompose compounds and keeps the original form.
+select tokenize_ko('중요한 새
기능을 개발해줘서
정말 고마워요!', 'mixed');
+>
["중요","기능","개발","줘서","주","고마워요","고맙"]
-select tokenize_ko('소설
무궁화꽃이
피었습니다.', '-mode mixed');
->
["소설","무궁화","무궁","화","꽃","피"]
+select tokenize_ko('중요한 새
기능을 개발해줘서
정말 고마워요!', '-mode
mixed');
+>
["중요","기능","개발","줘서","주","고마워요","고맙"]
-select tokenize_ko('소설
무궁화꽃이
피었습니다.', '-mode none');
->
["소설","무궁화","꽃","피"]
+-- none mode: No decomposition for compound.
+select tokenize_ko('중요한 새
기능을 개발해줘서
정말 고마워요!', '-mode
none');
+>
["중요","기능","개발","줘서","고마워요"]
select tokenize_ko('Hello, world.', '-mode none');
> ["hello","world"]
@@ -2589,13 +2656,6 @@ select tokenize_ko('Hello, world.',
'-mode none');
select tokenize_ko('Hello, world.', '-mode none
-outputUnknownUnigrams');
>
["h","e","l","l","o","w","o","r","l","d"]
--- default stopward (null), with stoptags
-select tokenize_ko('소설
무궁화꽃이
피었습니다.', 'discard', null,
array('E'));
->
["소설","무궁","화","꽃","이","피"]
-
-select tokenize_ko('소설
무궁화꽃이
피었습니다.', 'discard', null,
array('E', 'VV'));
->
["소설","무궁","화","꽃","이"]
-
select tokenize_ko('나는 C++ 언어를
프로그래밍 언어로
사랑한다.', '-mode discard');
>
["나","c","언어","프로그래밍","언어","사랑"]
@@ -2682,7 +2742,7 @@ Apache Hivemall is an effort undergoing incubation at The
Apache Software Founda
<script>
var gitbook = gitbook || [];
gitbook.push(function() {
- gitbook.page.hasChanged({"page":{"title":"Text
Tokenizer","level":"2.3","depth":1,"next":{"title":"Approximate Aggregate
Functions","level":"2.4","depth":1,"path":"misc/approx.md","ref":"misc/approx.md","articles":[]},"previous":{"title":"Efficient
Top-K Query
Processing","level":"2.2","depth":1,"path":"misc/topk.md","ref":"misc/topk.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","etoc","callouts","toggle-chapters","anchorjs",
[...]
+ gitbook.page.hasChanged({"page":{"title":"Text
Tokenizer","level":"2.3","depth":1,"next":{"title":"Approximate Aggregate
Functions","level":"2.4","depth":1,"path":"misc/approx.md","ref":"misc/approx.md","articles":[]},"previous":{"title":"Efficient
Top-K Query
Processing","level":"2.2","depth":1,"path":"misc/topk.md","ref":"misc/topk.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","etoc","callouts","toggle-chapters","anchorjs",
[...]
});
</script>
</div>