Revision: 7862 http://languagetool.svn.sourceforge.net/languagetool/?rev=7862&view=rev Author: dominikoeo Date: 2012-08-14 18:45:55 +0000 (Tue, 14 Aug 2012) Log Message: ----------- - common word tokenizer specified ` (U+0060) twice - tokenize English words on *, ` and |.
Modified Paths: -------------- trunk/JLanguageTool/src/java/org/languagetool/tokenizers/WordTokenizer.java trunk/JLanguageTool/src/java/org/languagetool/tokenizers/en/EnglishWordTokenizer.java Modified: trunk/JLanguageTool/src/java/org/languagetool/tokenizers/WordTokenizer.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/tokenizers/WordTokenizer.java 2012-08-13 22:37:17 UTC (rev 7861) +++ trunk/JLanguageTool/src/java/org/languagetool/tokenizers/WordTokenizer.java 2012-08-14 18:45:55 UTC (rev 7862) @@ -37,13 +37,13 @@ public List<String> tokenize(final String text) { final List<String> l = new ArrayList<String>(); final StringTokenizer st = new StringTokenizer(text, - "\u0020\u0060\u007c\u00A0\u115f\u1160\u1680" + "\u0020\u00A0\u115f\u1160\u1680" + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f" + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f" + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d" + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" - + ",.;()[]{}<>!?:/\\\"'«»„”“`´‘’‛′…¿¡\t\n\r", true); + + ",.;()[]{}<>!?:/|\\\"'«»„”“`´‘’‛′…¿¡\t\n\r", true); while (st.hasMoreElements()) { l.add(st.nextToken()); } Modified: trunk/JLanguageTool/src/java/org/languagetool/tokenizers/en/EnglishWordTokenizer.java =================================================================== --- trunk/JLanguageTool/src/java/org/languagetool/tokenizers/en/EnglishWordTokenizer.java 2012-08-13 22:37:17 UTC (rev 7861) +++ trunk/JLanguageTool/src/java/org/languagetool/tokenizers/en/EnglishWordTokenizer.java 2012-08-14 18:45:55 UTC (rev 7862) @@ -44,7 +44,7 @@ + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f" + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d" + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" - + "—,.;()[]{}!?:\"'’‘„“”…\\/\t\n", true); + + "—,.;()[]{}!?:|*\"'’‘`„“”…\\/\t\n", true); while (st.hasMoreElements()) { 
tokens.add(st.nextToken()); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Live Security Virtual Conference Exclusive live event will cover all the ways today's security and threat landscape has changed and how IT managers can respond. Discussions will include endpoint security, mobile security and the latest in malware threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/ _______________________________________________ Languagetool-cvs mailing list Languagetool-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-cvs