Revision: 8549
http://languagetool.svn.sourceforge.net/languagetool/?rev=8549&view=rev
Author: jaumeortola
Date: 2012-12-15 00:59:08 +0000 (Sat, 15 Dec 2012)
Log Message:
-----------
[ca] Tokenize correctly two proper names united by a hyphen. Ex.: El tren
Barcelona-València.
Modified Paths:
--------------
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanWordTokenizerTest.java
Modified:
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
===================================================================
---
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
2012-12-15 00:31:15 UTC (rev 8548)
+++
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
2012-12-15 00:59:08 UTC (rev 8549)
@@ -88,7 +88,7 @@
final List<String> l = new ArrayList<String>();
final StringTokenizer st = new
StringTokenizer(text.replaceAll("([\\p{L}])['’]([\\p{L}])", "$1##CA_APOS##$2")
.replaceAll("([\\p{L}])-([\\p{L}])-([\\p{L}])",
"$1##CA_HYPHEN##$2##CA_HYPHEN##$3") //it's necessary for words like
"vint-i-quatre"
- .replaceAll("([\\p{L}])-([\\p{L}])",
"$1##CA_HYPHEN##$2"),
+ .replaceAll("([\\p{Ll}])-([\\p{Ll}])",
"$1##CA_HYPHEN##$2"),
"\u0020\u00A0\u115f\u1160\u1680"
+
"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+
"\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
Modified:
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanWordTokenizerTest.java
===================================================================
---
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanWordTokenizerTest.java
2012-12-15 00:31:15 UTC (rev 8548)
+++
trunk/JLanguageTool/src/test/java/org/languagetool/tokenizers/ca/CatalanWordTokenizerTest.java
2012-12-15 00:59:08 UTC (rev 8549)
@@ -31,6 +31,14 @@
assertEquals(tokens.size(), 13);
assertEquals("[Emporta, -te, 'ls, , a, , l', observatori, , de, ls, ,
mars]",
tokens.toString());
+ tokens = wordTokenizer.tokenize("El tren Barcelona-València");
+ assertEquals(tokens.size(), 7);
+ assertEquals("[El, , tren, , Barcelona, -, València]",
+ tokens.toString());
+ tokens = wordTokenizer.tokenize("N'hi ha vint-i-quatre");
+ assertEquals(tokens.size(), 6);
+ assertEquals("[N', hi, , ha, , vint-i-quatre]",
+ tokens.toString());
}
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
LogMeIn Rescue: Anywhere, Anytime Remote support for IT. Free Trial
Remotely access PCs and mobile devices and provide instant support
Improve your efficiency, and focus on delivering more value-add services
Discover what IT Professionals Know. Rescue delivers
http://p.sf.net/sfu/logmein_12329d2d
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits