Revision: 9051
http://languagetool.svn.sourceforge.net/languagetool/?rev=9051&view=rev
Author: jaumeortola
Date: 2013-01-18 14:55:21 +0000 (Fri, 18 Jan 2013)
Log Message:
-----------
[ca] Small changes in Catalan word tokenizer.
Modified Paths:
--------------
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanWordTokenizerTest.java
Modified:
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
===================================================================
---
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
2013-01-18 13:08:49 UTC (rev 9050)
+++
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
2013-01-18 14:55:21 UTC (rev 9051)
@@ -59,18 +59,18 @@
// Match verb+3 pronoms febles (rare but possible!). Ex: Emporta-te'ls-hi.
// It creates 4 tokens: <token>Emporta</token><token>-te</token><token>'ls</token><token>-hi</token>
- patterns[2] =
Pattern.compile("^([lnmtsd]')(.*)"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
- patterns[3] =
Pattern.compile("^(.*)"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+ patterns[2] =
Pattern.compile("^([lnmtsd]')(.{2,})"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+ patterns[3] =
Pattern.compile("^(.{2,})"+PF+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
// Match verb+2 pronoms febles. Ex: Emporta-te'ls.
// It creates 3 tokens: <token>Emporta</token><token>-te</token><token>'ls</token>
- patterns[4] =
Pattern.compile("^([lnmtsd]')(.*)"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
- patterns[5] =
Pattern.compile("^(.*)"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+ patterns[4] =
Pattern.compile("^([lnmtsd]')(.{2,})"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+ patterns[5] =
Pattern.compile("^(.{2,})"+PF+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
// match verb+1 pronom feble. Ex: Emporta't, vés-hi, porta'm.
// It creates 2 tokens: <token>Emporta</token><token>'t</token>
- patterns[6] =
Pattern.compile("^([lnmtsd]')(.*)"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
- patterns[7] =
Pattern.compile("^(.*)"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+ patterns[6] =
Pattern.compile("^([lnmtsd]')(.{2,})"+PF+"$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+ patterns[7] = Pattern.compile("^(.{2,})"+PF+"$",Pattern.UNICODE_CASE);
// d'emportar
patterns[8] =
Pattern.compile("^([lnmtsd]')(.*)$",Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
@@ -94,11 +94,14 @@
final List<String> l = new ArrayList<String>();
final StringTokenizer st = new StringTokenizer(
text.replaceAll("([\\p{L}])['’]([\\p{L}])",
"$1##CA_APOS##$2")
+ // Cases: d'1 km, és l'1 de gener, és d'1.4 kg
+ .replaceAll("([dlDL])['’](1[\\s\\.,])", "$1##CA_APOS##$2")
//it's necessary for words like "vint-i-quatre"
.replaceAll("([\\p{L}])-([\\p{L}])-([\\p{L}])",
"$1##CA_HYPHEN##$2##CA_HYPHEN##$3")
.replaceAll("([\\p{L}])-([\\p{L}\\d])", "$1##CA_HYPHEN##$2")
.replaceAll("([\\d])\\.([\\d])", "$1##CA_DECIMALPOINT##$2")
.replaceAll("([\\d]),([\\d])","$1##CA_DECIMALCOMMA##$2")
+ .replaceAll("([\\d]) ([\\d])","$1##CA_SPACE##$2")
// allows correcting typographical errors in "ela geminada"
.replaceAll("l\\.l",
"##ELA_GEMINADA##"),
"\u0020\u00A0\u115f\u1160\u1680"
@@ -118,6 +121,7 @@
.replaceAll("##CA_HYPHEN##", "-")
.replaceAll("##CA_DECIMALPOINT##", ".")
.replaceAll("##CA_DECIMALCOMMA##", ",")
+ .replaceAll("##CA_SPACE##", " ")
.replaceAll("##ELA_GEMINADA##", "l.l");
Matcher matcher = null;
boolean matchFound = false;
Modified:
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanWordTokenizerTest.java
===================================================================
---
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanWordTokenizerTest.java
2013-01-18 13:08:49 UTC (rev 9050)
+++
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanWordTokenizerTest.java
2013-01-18 14:55:21 UTC (rev 9051)
@@ -25,24 +25,38 @@
public class CatalanWordTokenizerTest extends TestCase {
- public void testTokenize() {
- CatalanWordTokenizer wordTokenizer = new CatalanWordTokenizer();
- List<String> tokens = wordTokenizer.tokenize("Emporta-te'ls a
l'observatori dels mars");
- assertEquals(tokens.size(), 13);
- assertEquals("[Emporta, -te, 'ls, , a, , l', observatori, , de, ls, ,
mars]",
- tokens.toString());
- tokens = wordTokenizer.tokenize("El tren Barcelona-València");
- assertEquals(tokens.size(), 7);
- assertEquals("[El, , tren, , Barcelona, -, València]",
- tokens.toString());
- tokens = wordTokenizer.tokenize("N'hi ha vint-i-quatre");
- assertEquals(tokens.size(), 6);
- assertEquals("[N', hi, , ha, , vint-i-quatre]",
- tokens.toString());
- tokens = wordTokenizer.tokenize("Mont-ras");
- assertEquals(tokens.size(), 1);
- assertEquals("[Mont-ras]",
- tokens.toString());
- }
-
+ public void testTokenize() {
+ CatalanWordTokenizer wordTokenizer = new CatalanWordTokenizer();
+ List<String> tokens = wordTokenizer
+ .tokenize("Emporta-te'ls a l'observatori dels mars");
+ assertEquals(tokens.size(), 13);
+ assertEquals(
+ "[Emporta, -te, 'ls, , a, , l', observatori, , de, ls, , mars]",
+ tokens.toString());
+ tokens = wordTokenizer.tokenize("El tren Barcelona-València");
+ assertEquals(tokens.size(), 7);
+ assertEquals("[El, , tren, , Barcelona, -, València]",
+ tokens.toString());
+ tokens = wordTokenizer.tokenize("N'hi ha vint-i-quatre");
+ assertEquals(tokens.size(), 6);
+ assertEquals("[N', hi, , ha, , vint-i-quatre]",
tokens.toString());
+ tokens = wordTokenizer.tokenize("Mont-ras");
+ assertEquals(tokens.size(), 1);
+ assertEquals("[Mont-ras]", tokens.toString());
+ tokens = wordTokenizer.tokenize("És d'1 km.");
+ assertEquals(tokens.size(), 7);
+ assertEquals("[És, , d', 1, , km, .]", tokens.toString());
+ tokens = wordTokenizer.tokenize("És d'1,5 km.");
+ assertEquals(tokens.size(), 7);
+ assertEquals("[És, , d', 1,5, , km, .]", tokens.toString());
+ tokens = wordTokenizer.tokenize("la direcció E-SE");
+ assertEquals(tokens.size(), 7);
+ assertEquals("[la, , direcció, , E, -, SE]",
tokens.toString());
+ tokens = wordTokenizer.tokenize("la direcció NW-SE");
+ assertEquals(tokens.size(), 7);
+ assertEquals("[la, , direcció, , NW, -, SE]",
tokens.toString());
+ tokens = wordTokenizer.tokenize("Se'n dóna vergonya");
+ assertEquals(tokens.size(), 6);
+ assertEquals("[Se, 'n, , dóna, , vergonya]",
tokens.toString());
+ }
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Master HTML5, CSS3, ASP.NET, MVC, AJAX, Knockout.js, Web API and
much more. Get web development skills now with LearnDevNow -
350+ hours of step-by-step video tutorials by Microsoft MVPs and experts.
SALE $99.99 this month only -- learn more at:
http://p.sf.net/sfu/learnmore_122812
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits