Revision: 7367
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=7367&view=rev
Author:   dnaber
Date:     2012-06-16 16:22:02 +0000 (Sat, 16 Jun 2012)
Log Message:
-----------
OOo/LO: fixed false alarm about word being uppercase when the previous sentence 
ended with a footnote - Sourceforge bug #3534637

Modified Paths:
--------------
    trunk/JLanguageTool/CHANGES.txt
    trunk/JLanguageTool/src/resource/segment.srx
    
    trunk/JLanguageTool/src/test/org/languagetool/tokenizers/SRXSentenceTokenizerTest.java

Modified: trunk/JLanguageTool/CHANGES.txt
===================================================================
--- trunk/JLanguageTool/CHANGES.txt     2012-06-16 14:30:27 UTC (rev 7366)
+++ trunk/JLanguageTool/CHANGES.txt     2012-06-16 16:22:02 UTC (rev 7367)
@@ -65,6 +65,9 @@
 
  -LibreOffice / OpenOffice integration: SingletonFactory now implements
   XServiceInfo (Stephan Bergmann) - Sourceforge bug #3526635
+
+ -LibreOffice / OpenOffice integration: Fixed false alarm about word being
+  uppercase when the previous sentence ended with a footnote - Sourceforge bug #3534637
   
  -LanguageTool supports now separate rules for different local variants of a language,
   for example American English and British English. To use them from the command line, 

Modified: trunk/JLanguageTool/src/resource/segment.srx
===================================================================
--- trunk/JLanguageTool/src/resource/segment.srx        2012-06-16 14:30:27 UTC (rev 7366)
+++ trunk/JLanguageTool/src/resource/segment.srx        2012-06-16 16:22:02 UTC (rev 7367)
@@ -4504,7 +4504,7 @@
 </rule>
 <!-- Break rules -->
 <rule break="yes">
-<beforebreak>[\.!?…]['|"|«|\)|\]|\}]?\s+</beforebreak>
+<beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}]?\s+</beforebreak>
 <afterbreak></afterbreak>
 </rule>
 <rule break="yes">
@@ -4657,7 +4657,7 @@
 </rule>
 <!-- Break rules -->
 <rule break="yes">
-<beforebreak>[\.!?…]['|"|«|\)|\]|\}]?\s+</beforebreak>
+<beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}]?\s+</beforebreak>
 <afterbreak></afterbreak>
 </rule>
 <rule break="yes">
@@ -4685,7 +4685,7 @@
 </rule>
 <!-- Break rules -->
 <rule break="yes">
-<beforebreak>[\.!?…]['|"|«|\)|\]|\}]?\s+</beforebreak>
+<beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}]?\s+</beforebreak>
 <afterbreak></afterbreak>
 </rule>
 <rule break="yes">

Modified: trunk/JLanguageTool/src/test/org/languagetool/tokenizers/SRXSentenceTokenizerTest.java
===================================================================
--- trunk/JLanguageTool/src/test/org/languagetool/tokenizers/SRXSentenceTokenizerTest.java      2012-06-16 14:30:27 UTC (rev 7366)
+++ trunk/JLanguageTool/src/test/org/languagetool/tokenizers/SRXSentenceTokenizerTest.java      2012-06-16 16:22:02 UTC (rev 7367)
@@ -19,6 +19,7 @@
 package org.languagetool.tokenizers;
 
 import junit.framework.TestCase;
+import org.languagetool.Language;
 import org.languagetool.TestTools;
 
 /**
@@ -30,7 +31,7 @@
   private SentenceTokenizer stokenizer = new SRXSentenceTokenizer("en");
   // accept only \n\n as paragraph:
   private SentenceTokenizer stokenizer2 = new SRXSentenceTokenizer("en");
-  
+
   public void setUp() {
     stokenizer.setSingleLineBreaksMarksParagraph(true);  
     stokenizer2.setSingleLineBreaksMarksParagraph(false);  
@@ -99,8 +100,26 @@
     testSplit(new String[] { "It works [really!]. ", "No doubt." });
     testSplit(new String[] { "It really(!) works well." });
     testSplit(new String[] { "It really[!] works well." });
+
+    testSplit(new String[] { "This is a sentence.\u0002 ", "And this is another one." });  // footnotes in LibOO/OOo look like this
   }
 
+  public void testOfficeFootnoteTokenize() {
+    for (Language language : Language.REAL_LANGUAGES) {
+      if (language.getSentenceTokenizer().getClass() != SRXSentenceTokenizer.class) {
+        continue;
+      }
+      if (language == Language.KHMER || language == Language.MALAYALAM || language.getShortName().equals("pt")) {
+        // TODO: I don't know about these...
+        continue;
+      }
+      final String input = "A sentence.\u0002 And another one.";
+      final SentenceTokenizer tokenizer = new SRXSentenceTokenizer(language.getShortName());
+      assertEquals("Sentence not split correctly for " + language + ": '" + input + "'",
+              "[A sentence.\u0002 , And another one.]", tokenizer.tokenize(input).toString());
+    }
+  }
+
   private void testSplit(String[] sentences) {
     TestTools.testSplit(sentences, stokenizer);
   }

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
Languagetool-cvs mailing list
Languagetool-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/languagetool-cvs

Reply via email to