OPENNLP-1047: Add detokenizer and sentence-detection abbreviations for Irish. Closes #188
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/caeaaeea Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/caeaaeea Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/caeaaeea Branch: refs/heads/LangDetect Commit: caeaaeea61e88fe4222b997b2dad49728b91ba68 Parents: 3df659b Author: Jim O'Regan <jaore...@tcd.ie> Authored: Sat Apr 29 00:06:42 2017 +0100 Committer: Jörn Kottmann <jo...@apache.org> Committed: Wed May 3 12:05:16 2017 +0200 ---------------------------------------------------------------------- opennlp-tools/lang/ga/sentdetect/abb.xml | 164 +++++++++++++++++++ .../lang/ga/tokenizer/ga-detokenizer.xml | 113 +++++++++++++ 2 files changed, 277 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/caeaaeea/opennlp-tools/lang/ga/sentdetect/abb.xml ---------------------------------------------------------------------- diff --git a/opennlp-tools/lang/ga/sentdetect/abb.xml b/opennlp-tools/lang/ga/sentdetect/abb.xml new file mode 100644 index 0000000..9d15aed --- /dev/null +++ b/opennlp-tools/lang/ga/sentdetect/abb.xml @@ -0,0 +1,164 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. 
See the License for the + specific language governing permissions and limitations + under the License. +--> + +<dictionary case_sensitive="false"> <!-- Abbreviation list for Irish (ga) sentence detection, per this file's path lang/ga/sentdetect/abb.xml. Every entry holds a single token that ends in a period. The root attribute case_sensitive="false" marks the dictionary as case-insensitive. Contents appear to be titles, English and Irish month abbreviations, and company suffixes (reviewer reading of the tokens; confirm with a native speaker before extending). --> +<entry> +<token>tel.</token> +</entry> +<entry> +<token>Mr.</token> +</entry> +<entry> +<token>Mrs.</token> +</entry> +<entry> +<token>.i.</token> +</entry> +<entry> +<token>Uacht.</token> +</entry> +<entry> +<token>m.sh.</token> +</entry> +<entry> +<token>lch.</token> +</entry> +<entry> +<token>lgh.</token> +</entry> +<entry> +<token>Dr.</token> +</entry> +<entry> +<token>uimh.</token> +</entry> +<entry> +<token>Co.</token> +</entry> +<entry> +<token>gCo.</token> +</entry> +<entry> +<token>tUacht.</token> +</entry> +<entry> +<token>Uas.</token> +</entry> +<entry> +<token>tUas.</token> +</entry> +<entry> +<token>Msc.</token> +</entry> +<entry> +<token>Ms.</token> +</entry> +<entry> +<token>Sr.</token> +</entry> +<entry> +<token>Jr.</token> +</entry> +<entry> +<token>Bros.</token> +</entry> +<entry> +<token>fig.</token> +</entry> +<entry> +<token>Jan.</token> +</entry> +<entry> +<token>Feb.</token> +</entry> +<entry> +<token>Mar.</token> +</entry> +<entry> +<token>Apr.</token> +</entry> +<entry> +<token>Jun.</token> +</entry> +<entry> +<token>Jul.</token> +</entry> +<entry> +<token>Aug.</token> +</entry> +<entry> +<token>Sep.</token> +</entry> +<entry> +<token>Sept.</token> +</entry> +<entry> +<token>Oct.</token> +</entry> +<entry> +<token>Nov.</token> +</entry> +<entry> +<token>Dec.</token> +</entry> +<entry> +<token>Ean.</token> +</entry> +<entry> +<token>Fea.</token> +</entry> +<entry> +<token>Már.</token> +</entry> +<entry> +<token>Aib.</token> +</entry> +<entry> +<token>Bea.</token> +</entry> +<entry> +<token>Mei.</token> +</entry> +<entry> +<token>Iúl.</token> +</entry> +<entry> +<token>Lún.</token> +</entry> +<entry> +<token>M.Fr.</token> +</entry> +<entry> +<token>D.Fr.</token> +</entry> +<entry> +<token>Sam.</token> +</entry> +<entry> +<token>Nol.</token> +</entry> +<entry> +<token>Ltd.</token> +</entry> 
+<entry> +<token>Teo.</token> +</entry> +</dictionary> http://git-wip-us.apache.org/repos/asf/opennlp/blob/caeaaeea/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml ---------------------------------------------------------------------- diff --git a/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml b/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml new file mode 100644 index 0000000..23fe96a --- /dev/null +++ b/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml @@ -0,0 +1,113 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +<dictionary> <!-- Detokenizer rules for Irish (ga), per this file's path lang/ga/tokenizer/ga-detokenizer.xml. Each entry pairs exactly one token with an operation attribute; the values used here are MOVE_LEFT, MOVE_RIGHT, MOVE_BOTH and RIGHT_LEFT_MATCHING. Presumably the operation names the neighbouring token(s) the entry is joined to when text is detokenized (TODO confirm against the OpenNLP detokenizer documentation). The entries cover general punctuation, three domain suffixes, and six apostrophe-final forms at the end of the list that all use MOVE_RIGHT. --> + <entry operation="RIGHT_LEFT_MATCHING"> + <token>"</token> + </entry> + <entry operation="RIGHT_LEFT_MATCHING"> + <token>'</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>.</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>?</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>!</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>,</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>;</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>:</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>(</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>)</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>}</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>{</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>]</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>[</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>»</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>«</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>``</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>''</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>%</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>.org</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>.com</token> + </entry> + <entry operation="MOVE_LEFT"> + <token>.net</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>#</token> + </entry> + <entry operation="MOVE_BOTH"> + <token>-</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>m'</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>d'</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>b'</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>mb'</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>dh'</token> + </entry> + <entry operation="MOVE_RIGHT"> + <token>lem'</token> + </entry> +</dictionary>