Revision: 8547
http://languagetool.svn.sourceforge.net/languagetool/?rev=8547&view=rev
Author: jaumeortola
Date: 2012-12-15 00:16:49 +0000 (Sat, 15 Dec 2012)
Log Message:
-----------
[ca] Change in CatalanWordTokenizer:
The hyphen is used (again) as a word separator, because this usage is very
common in web pages.
As a consequence, some errors cannot be detected when the rule that replaces
hyphens with dashes is disabled.
Modified Paths:
--------------
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
Modified:
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
===================================================================
---
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
2012-12-14 23:02:14 UTC (rev 8546)
+++
trunk/JLanguageTool/src/main/java/org/languagetool/tokenizers/ca/CatalanWordTokenizer.java
2012-12-15 00:16:49 UTC (rev 8547)
@@ -80,13 +80,15 @@
/**
* @param text Text to tokenize
* @return List of tokens.
- * Note: a special string ##CA_APOS## is used to replace
apostrophes
- * during tokenizing (as in Dutch).
+ * Note: a special string ##CA_APOS## is used to replace
apostrophes,
+ * and ##CA_HYPHEN## to replace hyphens.
*/
@Override
public List<String> tokenize(final String text) {
final List<String> l = new ArrayList<String>();
- final StringTokenizer st = new
StringTokenizer(text.replaceAll("([\\p{L}])['’]([\\p{L}])", "$1##CA_APOS##$2"),
+ final StringTokenizer st = new
StringTokenizer(text.replaceAll("([\\p{L}])['’]([\\p{L}])", "$1##CA_APOS##$2")
+ .replaceAll("([\\p{L}])-([\\p{L}])-([\\p{L}])",
"$1##CA_HYPHEN##$2##CA_HYPHEN##$3") //it's necessary for words like
"vint-i-quatre"
+ .replaceAll("([\\p{L}])-([\\p{L}])",
"$1##CA_HYPHEN##$2"),
"\u0020\u00A0\u115f\u1160\u1680"
+
"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+
"\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
@@ -94,12 +96,12 @@
+
"\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
+
"\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d"
+
"\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
- +
",.;()[]{}<>!?:/\\\"'«»„”“‘’`´…¿¡\t\n\r", true);
+ +
",.;()[]{}<>!?:/\\\"'«»„”“‘’`´…¿¡-\t\n\r", true);
String s;
String groupStr;
while (st.hasMoreElements()) {
- s=st.nextToken().replace("##CA_APOS##", "'");
+ s=st.nextToken().replaceAll("##CA_APOS##",
"'").replaceAll("##CA_HYPHEN##", "-");
Matcher matcher=null;
boolean matchFound=false;
int j=0;
Modified:
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
===================================================================
---
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
2012-12-14 23:02:14 UTC (rev 8546)
+++
trunk/JLanguageTool/src/main/resources/org/languagetool/rules/ca/grammar.xml
2012-12-15 00:16:49 UTC (rev 8547)
@@ -7381,7 +7381,7 @@
</pattern>
<message>Combinació incorrecta de pronoms febles.</message>
<short>Combinació incorrecta de pronoms.</short>
- <example type="incorrect">Portem
<marker>-los-els-los</marker>.</example>
+ <example
type="incorrect">Portem<marker>-los-els-los</marker>.</example>
<example type="correct">Portem-los-els.</example>
</rule>
<rule>
@@ -7731,7 +7731,7 @@
</pattern>
<message>Forma de pronom feble incorrecta.</message>
<short>Forma de pronom feble incorrecta.</short>
- <example type="incorrect">Portar
<marker>-els</marker>.</example>
+ <example
type="incorrect">Portar<marker>-els</marker>.</example>
<example type="correct">Portar-los.</example>
</rule>
<rule>
@@ -7773,7 +7773,7 @@
</pattern>
<message>Cal apostrofar: <suggestion><match no="2"
regexp_match="-[e]?([^e])e?" regexp_replace="-$1e"></match><match no="3"
regexp_match="-e?([^e]{1,2})e?"
regexp_replace="'$1"></match></suggestion>.</message>
<short>Cal apostrofar.</short>
- <example type="incorrect">Emporta
<marker>-te</marker>-el.</example>
+ <example
type="incorrect">Emporta<marker>-te</marker>-el.</example>
<example type="correct">Emporta-te'l.</example>
</rule>
<rule>
@@ -8523,51 +8523,55 @@
<example type="incorrect">Ara mateix<marker>...</marker></example>
<example type="correct">Ara mateix…</example>
</rule>
- <rulegroup id="GUIONET_SOLT" name="Substitueix un guionet solt per
guió llarg" default="off">
+ <rulegroup id="GUIONET_GUIO" name="Substitueix un guionet solt per
guió llarg" default="off">
<rule>
<pattern>
- <token spacebefore="yes" regexp="yes">-[^-]+<exception
inflected="yes">numeral</exception></token>
+ <marker>
+ <token spacebefore="yes">-</token>
+ </marker>
+ <token spacebefore="no" regexp="yes">[^-].*</token>
</pattern>
- <message>Cal substituir el guionet. <suggestion><match no="1"
regexp_match=".(.+)" regexp_replace="—$1"></match></suggestion>.</message>
+ <message>Cal substituir el guionet per guió:
<suggestion>—</suggestion>.</message>
<short>Cal substituir el guionet</short>
- <example type="incorrect">Vine
<marker>-digué</marker>.</example>
- <example type="correct">Vine <marker>—digué</marker>.</example>
+ <example type="incorrect">Vine
<marker>-</marker>digué.</example>
+ <example type="correct">Vine —digué.</example>
</rule>
<rule>
<pattern>
+ <token regexp="yes">.*[^-]</token>
<marker>
- <token regexp="yes">[^-]+-</token>
+ <token spacebefore="no">-</token>
</marker>
</pattern>
- <message>Cal substituir el guionet. <suggestion><match no="1"
regexp_match="(.+)." regexp_replace="$1—"></match></suggestion>.</message>
+ <message>Cal substituir el guionet per guió:
<suggestion>—</suggestion>.</message>
<short>Cal substituir el guionet</short>
- <example type="incorrect"><marker>digué-</marker>
vine.</example>
- <example type="correct"><marker> digué—</marker> vine</example>
+ <example type="incorrect">digué<marker>-</marker>
vine.</example>
+ <example type="correct">digué— vine</example>
</rule>
<rule>
<pattern>
+ <token regexp="yes">.*[^-]</token>
<marker>
- <token regexp="yes">[^-]+-</token>
+ <token spacebefore="no">-</token>
</marker>
<token regexp="yes">;|,|\.</token>
</pattern>
- <message>Cal substituir el guionet. <suggestion><match no="1"
regexp_match="(.+)." regexp_replace="$1—"></match></suggestion>.</message>
+ <message>Cal substituir el guionet per guió:
<suggestion>—</suggestion>.</message>
<short>Cal substituir el guionet</short>
- <example type="incorrect"><marker>digué-</marker>,
vine.</example>
- <example type="correct"><marker> digué—</marker>,
vine</example>
+ <example type="incorrect">digué<marker>-</marker>,
vine.</example>
+ <example type="correct">digué—, vine</example>
</rule>
<rule>
<pattern>
<marker>
- <token regexp="yes">[-]+</token>
+ <token>-</token>
+ <token>-</token>
</marker>
</pattern>
- <message>Cal substituir els guionets.
<suggestion>—</suggestion>.</message>
+ <message>Cal substituir els guionets per guió:
<suggestion>—</suggestion>.</message>
<short>Cal substituir els guionets</short>
- <example type="incorrect"><marker>--</marker>
- </example>
- <example type="correct"><marker>—</marker>
- </example>
+ <example type="incorrect"><marker>--</marker></example>
+ <example type="correct"><marker>—</marker></example>
</rule>
</rulegroup>
<!-- <rulegroup id="apostrof_cometes" name="Apòstrof i cometes">
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
LogMeIn Rescue: Anywhere, Anytime Remote support for IT. Free Trial
Remotely access PCs and mobile devices and provide instant support
Improve your efficiency, and focus on delivering more value-add services
Discover what IT Professionals Know. Rescue delivers
http://p.sf.net/sfu/logmein_12329d2d
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits