Updated TextLangDetector and fixed build errors
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ea0e68b4 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ea0e68b4 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ea0e68b4 Branch: refs/heads/master Commit: ea0e68b43d3823834f01ea0048f77e7277404f03 Parents: eafe280 Author: trevorlewis <[email protected]> Authored: Fri Apr 22 11:12:26 2016 -0700 Committer: trevorlewis <[email protected]> Committed: Fri Apr 22 11:12:26 2016 -0700 ---------------------------------------------------------------------- tika-core/pom.xml | 5 + tika-example/pom.xml | 2 + tika-langdetect/pom.xml | 21 ++- .../tika/langdetect/TextLangDetector.java | 139 +++++++------------ .../tika/langdetect/TextLangDetectorTest.java | 12 +- tika-translate/pom.xml | 2 +- 6 files changed, 74 insertions(+), 107 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-core/pom.xml ---------------------------------------------------------------------- diff --git a/tika-core/pom.xml b/tika-core/pom.xml index 558d1b1..4776e48 100644 --- a/tika-core/pom.xml +++ b/tika-core/pom.xml @@ -113,6 +113,11 @@ <configuration> <excludes> <exlude>org/apache/tika/config/TikaActivator</exlude> + <exlude>org/apache/tika/language/LanguageIdentifier</exlude> + <exlude>org/apache/tika/language/LanguageProfile</exlude> + <exlude>org/apache/tika/language/LanguageProfilerBuilder</exlude> + <exlude>org/apache/tika/language/ProfilingHandler</exlude> + <exlude>org/apache/tika/language/ProfilingWriter</exlude> <exlude>org/apache/tika/metadata/Property$PropertyType</exlude> <exlude>org/apache/tika/metadata/Property$ValueType</exlude> <exlude>org/apache/tika/metadata/DublinCore</exlude> http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-example/pom.xml ---------------------------------------------------------------------- diff --git a/tika-example/pom.xml 
b/tika-example/pom.xml index fedb25c..276978b 100644 --- a/tika-example/pom.xml +++ b/tika-example/pom.xml @@ -116,6 +116,7 @@ <artifactId>junit</artifactId> <scope>test</scope> </dependency> + <!-- <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-test-resources</artifactId> @@ -123,6 +124,7 @@ <type>test-jar</type> <scope>test</scope> </dependency> + --> </dependencies> <description>This module contains examples of how to use Apache Tika.</description> http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-langdetect/pom.xml ---------------------------------------------------------------------- diff --git a/tika-langdetect/pom.xml b/tika-langdetect/pom.xml index f86bd90..2cc2b37 100644 --- a/tika-langdetect/pom.xml +++ b/tika-langdetect/pom.xml @@ -25,7 +25,7 @@ <parent> <groupId>org.apache.tika</groupId> <artifactId>tika-parent</artifactId> - <version>2.0-SNAPSHOT</version> + <version>1.13-SNAPSHOT</version> <relativePath>../tika-parent/pom.xml</relativePath> </parent> @@ -33,6 +33,10 @@ <packaging>bundle</packaging> <name>Apache Tika language detection</name> <url>http://tika.apache.org/</url> + + <properties> + <cxf.version>3.0.3</cxf.version> + </properties> <dependencies> <dependency> @@ -46,6 +50,11 @@ <version>0.5</version> </dependency> <dependency> + <groupId>org.apache.cxf</groupId> + <artifactId>cxf-rt-rs-client</artifactId> + <version>${cxf.version}</version> + </dependency> + <dependency> <groupId>com.google.code.gson</groupId> <artifactId>gson</artifactId> <version>2.6.1</version> @@ -62,16 +71,6 @@ <artifactId>slf4j-log4j12</artifactId> <scope>test</scope> </dependency> - <dependency> - <groupId>com.googlecode.json-simple</groupId> - <artifactId>json-simple</artifactId> - <version>1.1</version> - </dependency> - <dependency> - <groupId>com.googlecode.json-simple</groupId> - <artifactId>json-simple</artifactId> - <version>1.1.1</version> - </dependency> </dependencies> <build> 
http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java index 29c6527..89fbfe5 100644 --- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java +++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java @@ -19,70 +19,41 @@ package org.apache.tika.langdetect; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonParser; - +import org.apache.cxf.jaxrs.client.WebClient; import org.apache.tika.language.detect.LanguageConfidence; import org.apache.tika.language.detect.LanguageDetector; import org.apache.tika.language.detect.LanguageResult; -import java.io.*; -import java.net.ConnectException; -import java.net.HttpURLConnection; -import java.net.MalformedURLException; -import java.net.URL; +import javax.ws.rs.core.Response; +import java.io.CharArrayWriter; +import java.io.IOException; import java.util.*; + /** * Created by trevorlewis on 3/7/16. */ /** * Language Detection using MIT Lincoln Lab’s Text.jl library - * https://github.com/trevorlewis/TEXT-Language-REST + * https://github.com/trevorlewis/TextREST.jl * - * Please run the Julia lidHttpServer.jl before using this. + * Please run the TextREST.jl server before using this. 
*/ public class TextLangDetector extends LanguageDetector { + private static final String TEXT_REST_HOST = "http://localhost:8000"; + private static final String TEXT_LID_PATH = "/lid"; + + private static String restHostUrlStr; + private Set<String> languages; private CharArrayWriter writer; - private static URL url; - private static HttpURLConnection con = null; - private static OutputStreamWriter out = null; - private static InputStreamReader in = null; - public TextLangDetector(){ super(); - + restHostUrlStr = TEXT_REST_HOST; + languages = getAllLanguages(); writer = new CharArrayWriter(); - - try { - url = new URL("http://127.0.0.1:8000"); - } catch (MalformedURLException e) { - e.printStackTrace(); - } - - try { - con = (HttpURLConnection) url.openConnection(); - con.setRequestMethod("GET"); - - int responseCode = con.getResponseCode(); - if (responseCode == 200) { - languages = new HashSet<String>(); - in = new InputStreamReader(con.getInputStream()); - String json = getStringFromInputStreamReader(in); - JsonArray jsonArray = new JsonParser().parse(json).getAsJsonArray(); - for (JsonElement jsonElement: jsonArray) { - languages.add(jsonElement.toString()); - } - in.close(); - } - - con.disconnect(); - } catch (ConnectException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } } @Override @@ -119,63 +90,57 @@ public class TextLangDetector extends LanguageDetector { @Override public List<LanguageResult> detectAll() { List<LanguageResult> result = new ArrayList<>(); - - result.add(new LanguageResult(detect(writer.toString()), LanguageConfidence.MEDIUM, 0)); - + String language = detect(writer.toString()); + if (language != null) { + result.add(new LanguageResult(language, LanguageConfidence.MEDIUM, 1)); + } else { + result.add(new LanguageResult(language, LanguageConfidence.NONE, 0)); + } return result; } - private String detect(String content){ - String language = "error"; - + private Set<String> getAllLanguages() { + 
Set<String> languages = new HashSet<>(); try { - con = (HttpURLConnection) url.openConnection(); - con.setRequestMethod("PUT"); - con.setDoOutput(true); - - out = new OutputStreamWriter(con.getOutputStream()); - out.write(content); - out.close(); - - int responseCode = con.getResponseCode(); - if (responseCode == 200) { - in = new InputStreamReader(con.getInputStream()); - String json = getStringFromInputStreamReader(in); - language = new JsonParser().parse(json).getAsJsonObject().get("lang").getAsString(); - in.close(); + Response response = WebClient + .create(restHostUrlStr + TEXT_LID_PATH) + .get(); + String json = response.readEntity(String.class); + JsonArray jsonArray = new JsonParser().parse(json).getAsJsonObject().get("all_languages").getAsJsonArray(); + for (JsonElement jsonElement : jsonArray) { + languages.add(jsonElement.toString()); } - - con.disconnect(); - } catch (ConnectException e) { - e.printStackTrace(); - } catch (IOException e) { + } catch (Exception e) { e.printStackTrace(); } + return languages; + } + private String detect(String content) { + String language = null; + try { + Response response = WebClient + .create(restHostUrlStr + TEXT_LID_PATH) + .put(content); + String json = response.readEntity(String.class); + language = new JsonParser().parse(json).getAsJsonObject().get("language").getAsString(); + } catch (Exception e) { + e.printStackTrace(); + } return language; } - // convert InputStreamReader to String - private String getStringFromInputStreamReader(InputStreamReader in) { - BufferedReader br = null; - StringBuilder sb = new StringBuilder(); - String line; + protected static boolean canRun() { try { - br = new BufferedReader(in); - while ((line = br.readLine()) != null) { - sb.append(line); - } - } catch (IOException e) { + Response response = WebClient + .create(TEXT_REST_HOST + TEXT_LID_PATH) + .get(); + String json = response.readEntity(String.class); + JsonArray jsonArray = new 
JsonParser().parse(json).getAsJsonObject().get("all_languages").getAsJsonArray(); + return jsonArray.size() != 0; + } catch (Exception e) { e.printStackTrace(); - } finally { - if (br != null) { - try { - br.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } + return false; } - return sb.toString(); } } http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java ---------------------------------------------------------------------- diff --git a/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java b/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java index 9b4bdd1..d2fe26c 100644 --- a/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java +++ b/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java @@ -26,6 +26,7 @@ import java.util.List; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assume.assumeTrue; /** * Created by trevorlewis on 3/7/16. 
@@ -34,17 +35,15 @@ public class TextLangDetectorTest { @Test public void test() throws Exception { - LanguageDetector detector = new TextLangDetector(); + assumeTrue(TextLangDetector.canRun()); + LanguageDetector detector = new TextLangDetector(); LanguageWriter writer = new LanguageWriter(detector); List<String> lines = IOUtils.readLines(TextLangDetectorTest.class.getResourceAsStream("text-test.tsv")); - for (String line : lines) { String[] data = line.split("\t"); - if (data.length != 2) { - continue; - } + if (data.length != 2) continue; writer.reset(); writer.append(data[1]); @@ -52,9 +51,6 @@ public class TextLangDetectorTest { LanguageResult result = detector.detect(); assertNotNull(result); - /*if (!data[0].equals(result.getLanguage())) { - System.out.println(result.getLanguage() + " : " + data[0] + " - " + data[1]); - }*/ assertEquals(data[0], result.getLanguage()); } http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-translate/pom.xml ---------------------------------------------------------------------- diff --git a/tika-translate/pom.xml b/tika-translate/pom.xml index 3513c4d..f77b8c4 100644 --- a/tika-translate/pom.xml +++ b/tika-translate/pom.xml @@ -54,7 +54,7 @@ <dependency> <groupId>org.apache.cxf</groupId> <artifactId>cxf-rt-frontend-jaxrs</artifactId> - <version>2.7.8</version> + <version>3.0.3</version> </dependency> <dependency> <groupId>com.fasterxml.jackson.jaxrs</groupId>
