Revision: 9402
http://languagetool.svn.sourceforge.net/languagetool/?rev=9402&view=rev
Author: dnaber
Date: 2013-02-17 13:48:14 +0000 (Sun, 17 Feb 2013)
Log Message:
-----------
remove Bliki, all Wikipedia text extraction is now done with Sweble
Modified Paths:
--------------
trunk/languagetool/languagetool-wikipedia/pom.xml
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextFilterTools.java
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaTextFilterTest.java
Removed Paths:
-------------
trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BlikiWikipediaTextFilter.java
Modified: trunk/languagetool/languagetool-wikipedia/pom.xml
===================================================================
--- trunk/languagetool/languagetool-wikipedia/pom.xml 2013-02-17 09:53:18 UTC (rev 9401)
+++ trunk/languagetool/languagetool-wikipedia/pom.xml 2013-02-17 13:48:14 UTC (rev 9402)
@@ -122,11 +122,6 @@
<version>1.1.0</version>
</dependency>
<dependency>
- <groupId>info.bliki.wiki</groupId>
- <artifactId>bliki-core</artifactId>
- <version>3.0.19</version>
- </dependency>
- <dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>4.0.0</version>
@@ -143,17 +138,4 @@
</dependency>
</dependencies>
- <repositories>
- <repository>
- <id>info-bliki-repository</id>
- <url>http://gwtwiki.googlecode.com/svn/maven-repository/</url>
- <releases>
- <enabled>true</enabled>
- </releases>
- <snapshots>
- <enabled>false</enabled>
- </snapshots>
- </repository>
- </repositories>
-
</project>
\ No newline at end of file
Modified: trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java
===================================================================
--- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java 2013-02-17 09:53:18 UTC (rev 9401)
+++ trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BaseWikipediaDumpHandler.java 2013-02-17 13:48:14 UTC (rev 9402)
@@ -56,7 +56,7 @@
private StringBuilder text = new StringBuilder();
private String title;
- private TextFilter textFilter = new BlikiWikipediaTextFilter();
+ private TextFilter textFilter = new SwebleWikipediaTextFilter();
//===========================================================
// SAX DocumentHandler methods
Deleted: trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BlikiWikipediaTextFilter.java
===================================================================
--- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BlikiWikipediaTextFilter.java 2013-02-17 09:53:18 UTC (rev 9401)
+++ trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/BlikiWikipediaTextFilter.java 2013-02-17 13:48:14 UTC (rev 9402)
@@ -1,55 +0,0 @@
-/* LanguageTool, a natural language style checker
- * Copyright (C) 2010 Daniel Naber (http://www.danielnaber.de)
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
- * USA
- */
-package org.languagetool.dev.wikipedia;
-
-import info.bliki.wiki.model.WikiModel;
-
-import org.apache.commons.lang.StringEscapeUtils;
-import org.languagetool.TextFilter;
-
-/**
- * Convert Wikipedia syntax to HTML using Bliki and then try to clean it up (this is
- * rather ugly).
- */
-class BlikiWikipediaTextFilter implements TextFilter {
-
- public String filter(String s) {
- // TODO: find general HTML to Text converter?!:
- final WikiModel wikiModel = new WikiModel("${image}", "${title}");
- // image with link:
- s = s.replaceAll("\\[\\[Datei:.*?\\[\\[.*?\\]\\].*?\\]\\]", "");
- // image without link:
- s = s.replaceAll("\\[\\[Datei:.*?\\]\\]", "");
- s = wikiModel.render(s);
- //System.out.println("0####"+s);
- s = s.replaceAll("\\{\\{.*?\\}\\}", "");
- s = s.replaceAll("</p>", "\n\n");
- s = s.replaceAll("</dt>", "\n\n");
- s = s.replaceAll("</dl>", "\n\n");
- s = s.replaceAll("</h\\d>", "\n\n");
- s = s.replaceAll("<a href=\"http://[a-zA-Z-]+\\.wikipedia\\.org/wiki/.*?\">.*?</a>", "");
- s = s.replaceAll("<.*?>", "");
- s = s.replaceAll("\n\n*", "\n\n"); // single line break isn't detected as paragraph in LT by default
- s = StringEscapeUtils.unescapeHtml(s);
- //System.out.println("1############################################\n"+s);
- //System.out.println("/############################################"+s);
- return s.trim();
- }
-
-}
Modified: trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextFilterTools.java
===================================================================
--- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextFilterTools.java 2013-02-17 09:53:18 UTC (rev 9401)
+++ trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/TextFilterTools.java 2013-02-17 13:48:14 UTC (rev 9402)
@@ -19,6 +19,7 @@
package org.languagetool.dev.wikipedia;
import org.languagetool.Language;
+import org.languagetool.TextFilter;
/**
* Helper class.
@@ -28,10 +29,10 @@
private TextFilterTools() {
}
- static BlikiWikipediaTextFilter getTextFilter(Language lang) {
- final BlikiWikipediaTextFilter textFilter;
+ static TextFilter getTextFilter(Language lang) {
+ final SwebleWikipediaTextFilter textFilter;
if (lang.getShortName().equals("ro")) {
- textFilter = new BlikiWikipediaTextFilter() {
+ textFilter = new SwebleWikipediaTextFilter() {
@Override
public String filter(String arg0) {
final String tmp = super.filter(arg0);
@@ -40,7 +41,7 @@
}
};
} else {
- textFilter = new BlikiWikipediaTextFilter();
+ textFilter = new SwebleWikipediaTextFilter();
}
return textFilter;
}
Modified: trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java
===================================================================
--- trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java 2013-02-17 09:53:18 UTC (rev 9401)
+++ trunk/languagetool/languagetool-wikipedia/src/main/java/org/languagetool/dev/wikipedia/WikipediaIndexHandler.java 2013-02-17 13:48:14 UTC (rev 9402)
@@ -38,7 +38,6 @@
import org.xml.sax.helpers.DefaultHandler;
/**
- *
* Wikipedia handler for indexing. See {@link org.languagetool.dev.index.Searcher} for a
* class that lets you use this index.
*
@@ -62,7 +61,7 @@
private boolean inTitle = false;
private StringBuilder text = new StringBuilder();
private StringBuilder title = new StringBuilder();
- private TextFilter textFilter = new BlikiWikipediaTextFilter();
+ private TextFilter textFilter = new SwebleWikipediaTextFilter();
// ===========================================================
// SAX DocumentHandler methods
@@ -73,7 +72,7 @@
this.start = start;
this.end = end;
if (start > end && end != 0) {
- throw new RuntimeException("\"start\" should be smaller than \"end\"");
+ throw new RuntimeException("\"start\" should be smaller than \"end\": " + start + ", " + end);
}
textFilter = TextFilterTools.getTextFilter(language);
}
Modified: trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaTextFilterTest.java
===================================================================
--- trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaTextFilterTest.java 2013-02-17 09:53:18 UTC (rev 9401)
+++ trunk/languagetool/languagetool-wikipedia/src/test/java/org/languagetool/dev/wikipedia/WikipediaTextFilterTest.java 2013-02-17 13:48:14 UTC (rev 9402)
@@ -22,18 +22,15 @@
public class WikipediaTextFilterTest extends TestCase {
- final BlikiWikipediaTextFilter blikiFilter = new BlikiWikipediaTextFilter();
final SwebleWikipediaTextFilter swebleFilter = new SwebleWikipediaTextFilter();
public void testImageRemoval() throws Exception {
final String input = "foo [[Datei:Bundesarchiv Bild 183-1990-0803-017.jpg|miniatur|Mit Lothar de Maizière im August 1990]] bar";
- assertEquals("foo bar", blikiFilter.filter(input));
assertEquals("foo bar", swebleFilter.filter(input));
}
public void testRemovalOfImageWithLink() throws Exception {
final String input = "foo [[Datei:Bundesarchiv Bild 183-1990-0803-017.jpg|miniatur|Mit [[Lothar de Maizière]] im August 1990]] bar [[Link]]";
- assertEquals("foo bar Link", blikiFilter.filter(input));
assertEquals("foo bar Link", swebleFilter.filter(input));
}
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
The Go Parallel Website, sponsored by Intel - in partnership with Geeknet,
is your hub for all things parallel software development, from weekly thought
leadership blogs to news, videos, case studies, tutorials, tech docs,
whitepapers, evaluation guides, and opinion stories. Check out the most
recent posts - join the conversation now. http://goparallel.sourceforge.net/
_______________________________________________
Languagetool-commits mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/languagetool-commits