Author: kkrugler
Date: Fri Jul 9 21:23:49 2010
New Revision: 962698
URL: http://svn.apache.org/viewvc?rev=962698&view=rev
Log:
TIKA-453: Fix Estonian language identifier.
Also added test data for Estonian (a translation of a section of Genesis).
Added:
tika/trunk/tika-core/src/main/resources/org/apache/tika/language/et.ngp
- copied unchanged from r961850,
tika/trunk/tika-core/src/main/resources/org/apache/tika/language/ee.ngp
tika/trunk/tika-core/src/test/resources/org/apache/tika/language/et.test
Removed:
tika/trunk/tika-core/src/main/resources/org/apache/tika/language/ee.ngp
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java?rev=962698&r1=962697&r2=962698&view=diff
==============================================================================
---
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
(original)
+++
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
Fri Jul 9 21:23:49 2010
@@ -73,7 +73,7 @@ public class LanguageIdentifier {
static {
addProfile("da"); // Danish
addProfile("de"); // German
- addProfile("ee");
+ addProfile("et"); // Estonian
addProfile("el"); // Greek
addProfile("en"); // English
addProfile("es"); // Spanish
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java?rev=962698&r1=962697&r2=962698&view=diff
==============================================================================
---
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
(original)
+++
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageIdentifierTest.java
Fri Jul 9 21:23:49 2010
@@ -34,7 +34,9 @@ import org.apache.tika.io.IOUtils;
public class LanguageIdentifierTest extends TestCase {
private static final String[] languages = new String[] {
- "da", "de", /* "el", */ "en", "es", "fi", "fr", "it", "nl", "pt", "sv"
+ // TODO - currently Estonian and Greek fail these tests. Re-enable
+ // when language detection works better.
+ "da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it", "nl",
"pt", "sv"
};
public void testLanguageDetection() throws IOException {
@@ -63,6 +65,16 @@ public class LanguageIdentifierTest exte
}
}
+ // TIKA-453: Fix up language identifier used for Estonian
+ public void testEstonia() throws Exception {
+ final String estonian = "et";
+ ProfilingWriter writer = new ProfilingWriter();
+ writeTo(estonian, writer);
+ LanguageIdentifier identifier =
+ new LanguageIdentifier(writer.getProfile());
+ assertEquals(estonian, identifier.getLanguage());
+ }
+
private void writeTo(String language, Writer writer) throws IOException {
InputStream stream =
LanguageIdentifierTest.class.getResourceAsStream(language +
".test");
Added: tika/trunk/tika-core/src/test/resources/org/apache/tika/language/et.test
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/resources/org/apache/tika/language/et.test?rev=962698&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/resources/org/apache/tika/language/et.test
(added)
+++ tika/trunk/tika-core/src/test/resources/org/apache/tika/language/et.test
Fri Jul 9 21:23:49 2010
@@ -0,0 +1,17 @@
+Kogu maailmas aga oli üks keel ja ühesugused sõnad.
+
+Ja sündis, kui nad hommiku poolt teele läksid, et nad Sinearimaal leidsid
oru ja jäid sinna elama.
+
+Nad ütlesid üksteisele: "Tehkem nüüd telliskive ja põletagem neid
hästi." Siis olid telliskivid neile ehituskivideks ja maapigi oli saviks.
+
+Ja nad ütlesid: "Tulge, ehitagem enestele linn ja torn, mille tipp oleks
taevas, ja tehkem enestele nimi, et me ei hajuks üle kogu maailma!"
+
+Aga Jehoova tuli alla vaatama linna ja torni, mida inimlapsed ehitasid.
+
+Ja Jehoova ütles: "Vaata, rahvas on üks ja neil kõigil on üks keel, ja see
on alles nende tegude algus. Nüüd ei ole neil võimatu ükski asi, mida nad
kavatsevad teha!
+
+Mingem nüüd alla ja segagem seal nende keel, et nad üksteise keelt ei
mõistaks!"
+
+Ja Jehoova pillutas nad sealt üle kogu maailma ja nad jätsid linna ehitamata.
+
+Seepärast pandi sellele nimeks Paabel, sest seal Jehoova segas ära kogu
maailma keele ja sealt pillutas Jehoova nad üle kogu maailma.