Revision: 7367
          http://languagetool.svn.sourceforge.net/languagetool/?rev=7367&view=rev
Author:   dnaber
Date:     2012-06-16 16:22:02 +0000 (Sat, 16 Jun 2012)
Log Message:
-----------
OOo/LO: fixed false alarm about word being uppercase when the previous sentence ended with a footnote - Sourceforge bug #3534637
Modified Paths: -------------- trunk/JLanguageTool/CHANGES.txt trunk/JLanguageTool/src/resource/segment.srx trunk/JLanguageTool/src/test/org/languagetool/tokenizers/SRXSentenceTokenizerTest.java Modified: trunk/JLanguageTool/CHANGES.txt =================================================================== --- trunk/JLanguageTool/CHANGES.txt 2012-06-16 14:30:27 UTC (rev 7366) +++ trunk/JLanguageTool/CHANGES.txt 2012-06-16 16:22:02 UTC (rev 7367) @@ -65,6 +65,9 @@ -LibreOffice / OpenOffice integration: SingletonFactory now implements XServiceInfo (Stephan Bergmann) - Sourceforge bug #3526635 + + -LibreOffice / OpenOffice integration: Fixed false alarm about word being + uppercase when the previous sentence ended with a footnote - Sourceforge bug #3534637 -LanguageTool supports now separate rules for different local variants of a language, for example American English and British English. To use them from the command line, Modified: trunk/JLanguageTool/src/resource/segment.srx =================================================================== --- trunk/JLanguageTool/src/resource/segment.srx 2012-06-16 14:30:27 UTC (rev 7366) +++ trunk/JLanguageTool/src/resource/segment.srx 2012-06-16 16:22:02 UTC (rev 7367) @@ -4504,7 +4504,7 @@ </rule> <!-- Break rules --> <rule break="yes"> -<beforebreak>[\.!?…]['|"|«|\)|\]|\}]?\s+</beforebreak> +<beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}]?\s+</beforebreak> <afterbreak></afterbreak> </rule> <rule break="yes"> @@ -4657,7 +4657,7 @@ </rule> <!-- Break rules --> <rule break="yes"> -<beforebreak>[\.!?…]['|"|«|\)|\]|\}]?\s+</beforebreak> +<beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}]?\s+</beforebreak> <afterbreak></afterbreak> </rule> <rule break="yes"> @@ -4685,7 +4685,7 @@ </rule> <!-- Break rules --> <rule break="yes"> -<beforebreak>[\.!?…]['|"|«|\)|\]|\}]?\s+</beforebreak> +<beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}]?\s+</beforebreak> <afterbreak></afterbreak> </rule> <rule break="yes"> Modified: 
trunk/JLanguageTool/src/test/org/languagetool/tokenizers/SRXSentenceTokenizerTest.java =================================================================== --- trunk/JLanguageTool/src/test/org/languagetool/tokenizers/SRXSentenceTokenizerTest.java 2012-06-16 14:30:27 UTC (rev 7366) +++ trunk/JLanguageTool/src/test/org/languagetool/tokenizers/SRXSentenceTokenizerTest.java 2012-06-16 16:22:02 UTC (rev 7367) @@ -19,6 +19,7 @@ package org.languagetool.tokenizers; import junit.framework.TestCase; +import org.languagetool.Language; import org.languagetool.TestTools; /** @@ -30,7 +31,7 @@ private SentenceTokenizer stokenizer = new SRXSentenceTokenizer("en"); // accept only \n\n as paragraph: private SentenceTokenizer stokenizer2 = new SRXSentenceTokenizer("en"); - + public void setUp() { stokenizer.setSingleLineBreaksMarksParagraph(true); stokenizer2.setSingleLineBreaksMarksParagraph(false); @@ -99,8 +100,26 @@ testSplit(new String[] { "It works [really!]. ", "No doubt." }); testSplit(new String[] { "It really(!) works well." }); testSplit(new String[] { "It really[!] works well." }); + + testSplit(new String[] { "This is a sentence.\u0002 ", "And this is another one." }); // footnotes in LibOO/OOo look like this } + public void testOfficeFootnoteTokenize() { + for (Language language : Language.REAL_LANGUAGES) { + if (language.getSentenceTokenizer().getClass() != SRXSentenceTokenizer.class) { + continue; + } + if (language == Language.KHMER || language == Language.MALAYALAM || language.getShortName().equals("pt")) { + // TODO: I don't know about these... 
+ continue; + } + final String input = "A sentence.\u0002 And another one."; + final SentenceTokenizer tokenizer = new SRXSentenceTokenizer(language.getShortName()); + assertEquals("Sentence not split correctly for " + language + ": '" + input + "'", + "[A sentence.\u0002 , And another one.]", tokenizer.tokenize(input).toString()); + } + } + private void testSplit(String[] sentences) { TestTools.testSplit(sentences, stokenizer); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Live Security Virtual Conference Exclusive live event will cover all the ways today's security and threat landscape has changed and how IT managers can respond. Discussions will include endpoint security, mobile security and the latest in malware threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/ _______________________________________________ Languagetool-cvs mailing list Languagetool-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/languagetool-cvs