Revision: 9402
          
http://languagetool.svn.sourceforge.net/languagetool/?rev=9402&view=rev
Author:   dnaber
Date:     2013-02-17 13:48:14 +0000 (Sun, 17 Feb 2013)
Log Message:
-----------
remove Bliki, all Wikipedia text extraction is now done with Sweble

Modified Paths:
--------------
    trunk/languagetool/languagetool-wikipedia/pom.xml
    
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
    
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextFilterTools.java
    
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
    
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaTextFilterTest.java

Removed Paths:
-------------
    
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BlikiWikipediaTextFilter.java

Modified: trunk/languagetool/languagetool-wikipedia/pom.xml
===================================================================
--- trunk/languagetool/languagetool-wikipedia/pom.xml   2013-02-17 09:53:18 UTC 
(rev 9401)
+++ trunk/languagetool/languagetool-wikipedia/pom.xml   2013-02-17 13:48:14 UTC 
(rev 9402)
@@ -122,11 +122,6 @@
             <version>1.1.0</version>
         </dependency>
         <dependency>
-            <groupId>info.bliki.wiki</groupId>
-            <artifactId>bliki-core</artifactId>
-            <version>3.0.19</version>
-        </dependency>
-        <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-queries</artifactId>
             <version>4.0.0</version>
@@ -143,17 +138,4 @@
         </dependency>
     </dependencies>
 
-    <repositories>
-        <repository>
-            <id>info-bliki-repository</id>
-            <url>http://gwtwiki.googlecode.com/svn/maven-repository/</url>
-            <releases>
-                <enabled>true</enabled>
-            </releases>
-            <snapshots>
-                <enabled>false</enabled>
-            </snapshots>
-        </repository>
-    </repositories>
-    
 </project>
\ No newline at end of file

Modified: 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
        2013-02-17 09:53:18 UTC (rev 9401)
+++ 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
        2013-02-17 13:48:14 UTC (rev 9402)
@@ -56,7 +56,7 @@
   private StringBuilder text = new StringBuilder();
   private String title;
 
-  private TextFilter textFilter = new BlikiWikipediaTextFilter();
+  private TextFilter textFilter = new SwebleWikipediaTextFilter();
 
   //===========================================================
   // SAX DocumentHandler methods

Deleted: 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BlikiWikipediaTextFilter.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BlikiWikipediaTextFilter.java
        2013-02-17 09:53:18 UTC (rev 9401)
+++ 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BlikiWikipediaTextFilter.java
        2013-02-17 13:48:14 UTC (rev 9402)
@@ -1,55 +0,0 @@
-/* LanguageTool, a natural language style checker 
- * Copyright (C) 2010 Daniel Naber (http://www.danielnaber.de)
- * 
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
- * USA
- */
-package org.languagetool.dev.wikipedia;
-
-import info.bliki.wiki.model.WikiModel;
-
-import org.apache.commons.lang.StringEscapeUtils;
-import org.languagetool.TextFilter;
-
-/**
- * Convert Wikipedia syntax to HTML using Bliki and then try to clean it up (this is
- * rather ugly).
- */
-class BlikiWikipediaTextFilter implements TextFilter {
-
-  public String filter(String s) {
-    // TODO: find general HTML to Text converter?!:
-    final WikiModel wikiModel = new WikiModel("${image}", "${title}");
-    // image with link:
-    s = s.replaceAll("\\[\\[Datei:.*?\\[\\[.*?\\]\\].*?\\]\\]", "");
-    // image without link:
-    s = s.replaceAll("\\[\\[Datei:.*?\\]\\]", "");
-    s = wikiModel.render(s);
-    //System.out.println("0####"+s);
-    s = s.replaceAll("\\{\\{.*?\\}\\}", "");
-    s = s.replaceAll("</p>", "\n\n");
-    s = s.replaceAll("</dt>", "\n\n");
-    s = s.replaceAll("</dl>", "\n\n");
-    s = s.replaceAll("</h\\d>", "\n\n");
-    s = s.replaceAll("<a href=\"http://[a-zA-Z-]+\\.wikipedia\\.org/wiki/.*?\";>.*?</a>", "");
-    s = s.replaceAll("<.*?>", "");
-    s = s.replaceAll("\n\n*", "\n\n");    // single line break isn't detected as paragraph in LT by default
-    s = StringEscapeUtils.unescapeHtml(s);
-    //System.out.println("1############################################\n"+s);
-    //System.out.println("/############################################"+s);
-    return s.trim();
-  }
-
-}

Modified: 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextFilterTools.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextFilterTools.java
 2013-02-17 09:53:18 UTC (rev 9401)
+++ 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextFilterTools.java
 2013-02-17 13:48:14 UTC (rev 9402)
@@ -19,6 +19,7 @@
 package org.languagetool.dev.wikipedia;
 
 import org.languagetool.Language;
+import org.languagetool.TextFilter;
 
 /**
  * Helper class.
@@ -28,10 +29,10 @@
   private TextFilterTools() {
   }
 
-  static BlikiWikipediaTextFilter getTextFilter(Language lang) {
-    final BlikiWikipediaTextFilter textFilter;
+  static TextFilter getTextFilter(Language lang) {
+    final SwebleWikipediaTextFilter textFilter;
     if (lang.getShortName().equals("ro")) {
-      textFilter = new BlikiWikipediaTextFilter() {
+      textFilter = new SwebleWikipediaTextFilter() {
         @Override
         public String filter(String arg0) {
           final String tmp = super.filter(arg0);
@@ -40,7 +41,7 @@
         }
       };
     } else {
-      textFilter = new BlikiWikipediaTextFilter();
+      textFilter = new SwebleWikipediaTextFilter();
     }
     return textFilter;
   }

Modified: 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
   2013-02-17 09:53:18 UTC (rev 9401)
+++ 
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
   2013-02-17 13:48:14 UTC (rev 9402)
@@ -38,7 +38,6 @@
 import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * 
  * Wikipedia handler for indexing. See {@link org.languagetool.dev.index.Searcher} for a
  * class that lets you use this index.
  * 
@@ -62,7 +61,7 @@
   private boolean inTitle = false;
   private StringBuilder text = new StringBuilder();
   private StringBuilder title = new StringBuilder();
-  private TextFilter textFilter = new BlikiWikipediaTextFilter();
+  private TextFilter textFilter = new SwebleWikipediaTextFilter();
 
   // ===========================================================
   // SAX DocumentHandler methods
@@ -73,7 +72,7 @@
     this.start = start;
     this.end = end;
     if (start > end && end != 0) {
-      throw new RuntimeException("\"start\" should be smaller than \"end\"");
+      throw new RuntimeException("\"start\" should be smaller than \"end\": " + start + ", " + end);
     }
     textFilter = TextFilterTools.getTextFilter(language);
   }

Modified: 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaTextFilterTest.java
===================================================================
--- 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaTextFilterTest.java
 2013-02-17 09:53:18 UTC (rev 9401)
+++ 
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaTextFilterTest.java
 2013-02-17 13:48:14 UTC (rev 9402)
@@ -22,18 +22,15 @@
 
 public class WikipediaTextFilterTest extends TestCase {
 
-  final BlikiWikipediaTextFilter blikiFilter = new BlikiWikipediaTextFilter();
   final SwebleWikipediaTextFilter swebleFilter = new SwebleWikipediaTextFilter();
   
   public void testImageRemoval() throws Exception {
     final String input = "foo [[Datei:Bundesarchiv Bild 183-1990-0803-017.jpg|miniatur|Mit Lothar de Maizière im August 1990]] bar";
-    assertEquals("foo  bar", blikiFilter.filter(input));
     assertEquals("foo bar", swebleFilter.filter(input));
   }
   
   public void testRemovalOfImageWithLink() throws Exception {
     final String input = "foo [[Datei:Bundesarchiv Bild 183-1990-0803-017.jpg|miniatur|Mit [[Lothar de Maizière]] im August 1990]] bar [[Link]]";
-    assertEquals("foo  bar Link", blikiFilter.filter(input));
     assertEquals("foo bar Link", swebleFilter.filter(input));
   }
 

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
The Go Parallel Website, sponsored by Intel - in partnership with Geeknet, 
is your hub for all things parallel software development, from weekly thought 
leadership blogs to news, videos, case studies, tutorials, tech docs, 
whitepapers, evaluation guides, and opinion stories. Check out the most 
recent posts - join the conversation now. http://goparallel.sourceforge.net/
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits

Reply via email to