CatalanWordTokenizer.java

milek_pl Tue, 26 Jun 2012 04:07:35 -0700

Revision: 7513
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=7513&view=rev
Author:   milek_pl
Date:     2012-06-26 11:07:23 +0000 (Tue, 26 Jun 2012)
Log Message:
-----------
patterns should not be compiled on every tokenization call!


Modified Paths:
--------------
    
trunk/JLanguageTool/src/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java

Modified: 
trunk/JLanguageTool/src/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
===================================================================
--- 
trunk/JLanguageTool/src/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
       2012-06-26 10:51:19 UTC (rev 7512)
+++ 
trunk/JLanguageTool/src/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
       2012-06-26 11:07:23 UTC (rev 7513)
@@ -38,8 +38,43 @@
        //all possible forms of "pronoms febles" after a verb.
        private static final String PF = 
"('en|'hi|'ho|'l|'ls|'m|'n|'ns|'s|'t|-el|-els|-em|-en|-ens|-hi|-ho|-l|-la|-les|-li|-lo|-los|-m|-me|-n|-ne|-nos|-s|-se|-t|-te|-us|-vos)";
 
+    private int maxPatterns = 11;
+    private Pattern[] patterns = new Pattern[maxPatterns];
 
        public CatalanWordTokenizer() {
+
+        // Apostrophe at the beginning of a word. Ex.: l'home, s'estima, 
n'omple, hivern, etc.
+        // It creates 2 tokens: <token>l'</token><token>home</token>
+        patterns[0] = 
Pattern.compile("^([lnmtsd]')([^'\\-]*)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+
+        // Exceptions to (Match verb+1 pronom feble)
+        // It creates 1 token: <token>qui-sap-lo</token>
+        patterns[1] = 
Pattern.compile("^(qui-sap-lo|qui-sap-la|qui-sap-los|qui-sap-les)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+
+        // Match verb+3 pronoms febles (rare but possible!). Ex: 
Emporta-te'ls-hi.
+        // It creates 4 tokens: 
<token>Emporta</token><token>-te</token><token>'ls</token><token>-hi</token>
+        patterns[2] = 
Pattern.compile("^([lnmtsd]')(.*)"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+        patterns[3] = 
Pattern.compile("^(.*)"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+
+        // Match verb+2 pronoms febles. Ex: Emporta-te'ls. 
+        // It creates 3 tokens: 
<token>Emporta</token><token>-te</token><token>'ls</token>
+        patterns[4] = 
Pattern.compile("^([lnmtsd]')(.*)"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+        patterns[5] = 
Pattern.compile("^(.*)"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+
+        // match verb+1 pronom feble. Ex: Emporta't, vés-hi, porta'm.
+        // It creates 2 tokens: <token>Emporta</token><token>'t</token>
+        patterns[6] = 
Pattern.compile("^([lnmtsd]')(.*)"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+        patterns[7] = 
Pattern.compile("^(.*)"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+
+        // d'emportar
+        patterns[8] = 
Pattern.compile("^([lnmtsd]')(.*)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+
+        //contractions: al, als, pel, pels, del, dels, cal (!), cals (!) 
+        patterns[9] = 
Pattern.compile("^(a|de|pe)(ls?)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+
+        //contraction: can
+        patterns[10] = 
Pattern.compile("^(ca)(n)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+
        }
 
        /**
@@ -62,42 +97,7 @@
                                                + 
",.;()[]{}<>!?:/\\\"'«»„”“‘’`´…¿¡\t\n\r", true);
                String s;
                String groupStr;
-
-               int maxPatterns = 11;
-               Pattern[] patterns = new Pattern[maxPatterns];
-
-               // Apostrophe at the beginning of a word. Ex.: l'home, 
s'estima, n'omple, hivern, etc.
-               // It creates 2 tokens: <token>l'</token><token>home</token>
-               patterns[0] = 
Pattern.compile("^([lnmtsd]')([^'\\-]*)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-
-               // Exceptions to (Match verb+1 pronom feble)
-               // It creates 1 token: <token>qui-sap-lo</token>
-               patterns[1] = 
Pattern.compile("^(qui-sap-lo|qui-sap-la|qui-sap-los|qui-sap-les)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-
-               // Match verb+3 pronoms febles (rare but possible!). Ex: 
Emporta-te'ls-hi.
-               // It creates 4 tokens: 
<token>Emporta</token><token>-te</token><token>'ls</token><token>-hi</token>
-               patterns[2] = 
Pattern.compile("^([lnmtsd]')(.*)"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-               patterns[3] = 
Pattern.compile("^(.*)"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-
-               // Match verb+2 pronoms febles. Ex: Emporta-te'ls. 
-               // It creates 3 tokens: 
<token>Emporta</token><token>-te</token><token>'ls</token>
-               patterns[4] = 
Pattern.compile("^([lnmtsd]')(.*)"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-               patterns[5] = 
Pattern.compile("^(.*)"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-
-               // match verb+1 pronom feble. Ex: Emporta't, vés-hi, porta'm.
-               // It creates 2 tokens: <token>Emporta</token><token>'t</token>
-               patterns[6] = 
Pattern.compile("^([lnmtsd]')(.*)"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-               patterns[7] = 
Pattern.compile("^(.*)"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-
-               // d'emportar
-               patterns[8] = 
Pattern.compile("^([lnmtsd]')(.*)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-
-               //contractions: al, als, pel, pels, del, dels, cal (!), cals 
(!) 
-               patterns[9] = 
Pattern.compile("^(a|de|pe)(ls?)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-
-               //contraction: can
-               patterns[10] = 
Pattern.compile("^(ca)(n)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
-
+               
                while (st.hasMoreElements()) {
                        s=st.nextToken().replace("##CA_APOS##", "'");
                        Matcher matcher=null;

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs

[LanguageTool] SF.net SVN: languagetool:[7513] trunk/JLanguageTool/src/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java

Reply via email to