Updated TextLangDetector and fixed build errors

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ea0e68b4
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ea0e68b4
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ea0e68b4

Branch: refs/heads/master
Commit: ea0e68b43d3823834f01ea0048f77e7277404f03
Parents: eafe280
Author: trevorlewis <[email protected]>
Authored: Fri Apr 22 11:12:26 2016 -0700
Committer: trevorlewis <[email protected]>
Committed: Fri Apr 22 11:12:26 2016 -0700

----------------------------------------------------------------------
 tika-core/pom.xml                               |   5 +
 tika-example/pom.xml                            |   2 +
 tika-langdetect/pom.xml                         |  21 ++-
 .../tika/langdetect/TextLangDetector.java       | 139 +++++++------------
 .../tika/langdetect/TextLangDetectorTest.java   |  12 +-
 tika-translate/pom.xml                          |   2 +-
 6 files changed, 74 insertions(+), 107 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-core/pom.xml
----------------------------------------------------------------------
diff --git a/tika-core/pom.xml b/tika-core/pom.xml
index 558d1b1..4776e48 100644
--- a/tika-core/pom.xml
+++ b/tika-core/pom.xml
@@ -113,6 +113,11 @@
             <configuration>
               <excludes>
                 <exlude>org/apache/tika/config/TikaActivator</exlude>
+                <exlude>org/apache/tika/language/LanguageIdentifier</exlude>
+                <exlude>org/apache/tika/language/LanguageProfile</exlude>
+                <exlude>org/apache/tika/language/LanguageProfilerBuilder</exlude>
+                <exlude>org/apache/tika/language/ProfilingHandler</exlude>
+                <exlude>org/apache/tika/language/ProfilingWriter</exlude>
                 <exlude>org/apache/tika/metadata/Property$PropertyType</exlude>
                 <exlude>org/apache/tika/metadata/Property$ValueType</exlude>
                 <exlude>org/apache/tika/metadata/DublinCore</exlude>

http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-example/pom.xml
----------------------------------------------------------------------
diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index fedb25c..276978b 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -116,6 +116,7 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <!--
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-test-resources</artifactId>
@@ -123,6 +124,7 @@
       <type>test-jar</type>
       <scope>test</scope>
     </dependency>
+    -->
   </dependencies>
 
   <description>This module contains examples of how to use Apache Tika.</description>

http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-langdetect/pom.xml
----------------------------------------------------------------------
diff --git a/tika-langdetect/pom.xml b/tika-langdetect/pom.xml
index f86bd90..2cc2b37 100644
--- a/tika-langdetect/pom.xml
+++ b/tika-langdetect/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.tika</groupId>
     <artifactId>tika-parent</artifactId>
-    <version>2.0-SNAPSHOT</version>
+    <version>1.13-SNAPSHOT</version>
     <relativePath>../tika-parent/pom.xml</relativePath>
   </parent>
 
@@ -33,6 +33,10 @@
   <packaging>bundle</packaging>
   <name>Apache Tika language detection</name>
   <url>http://tika.apache.org/</url>
+
+  <properties>
+    <cxf.version>3.0.3</cxf.version>
+  </properties>
   
   <dependencies>
     <dependency>
@@ -46,6 +50,11 @@
       <version>0.5</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.cxf</groupId>
+      <artifactId>cxf-rt-rs-client</artifactId>
+      <version>${cxf.version}</version>
+    </dependency>
+    <dependency>
       <groupId>com.google.code.gson</groupId>
       <artifactId>gson</artifactId>
       <version>2.6.1</version>
@@ -62,16 +71,6 @@
       <artifactId>slf4j-log4j12</artifactId>
       <scope>test</scope>
     </dependency>
-    <dependency>
-      <groupId>com.googlecode.json-simple</groupId>
-      <artifactId>json-simple</artifactId>
-      <version>1.1</version>
-    </dependency>
-    <dependency>
-      <groupId>com.googlecode.json-simple</groupId>
-      <artifactId>json-simple</artifactId>
-      <version>1.1.1</version>
-    </dependency>
   </dependencies>
   
   <build>

http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java
----------------------------------------------------------------------
diff --git a/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java b/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java
index 29c6527..89fbfe5 100644
--- a/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java
+++ b/tika-langdetect/src/main/java/org/apache/tika/langdetect/TextLangDetector.java
@@ -19,70 +19,41 @@ package org.apache.tika.langdetect;
 import com.google.gson.JsonArray;
 import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
-
+import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.tika.language.detect.LanguageConfidence;
 import org.apache.tika.language.detect.LanguageDetector;
 import org.apache.tika.language.detect.LanguageResult;
 
-import java.io.*;
-import java.net.ConnectException;
-import java.net.HttpURLConnection;
-import java.net.MalformedURLException;
-import java.net.URL;
+import javax.ws.rs.core.Response;
+import java.io.CharArrayWriter;
+import java.io.IOException;
 import java.util.*;
 
+
 /**
  * Created by trevorlewis on 3/7/16.
  */
 /**
  * Language Detection using MIT Lincoln Lab’s Text.jl library
- * https://github.com/trevorlewis/TEXT-Language-REST
+ * https://github.com/trevorlewis/TextREST.jl
  *
- * Please run the Julia lidHttpServer.jl before using this.
+ * Please run the TextREST.jl server before using this.
  */
 public class TextLangDetector extends LanguageDetector {
 
+    private static final String TEXT_REST_HOST = "http://localhost:8000";
+    private static final String TEXT_LID_PATH = "/lid";
+
+    private static String restHostUrlStr;
+
     private Set<String> languages;
     private CharArrayWriter writer;
 
-    private static URL url;
-    private static HttpURLConnection con = null;
-    private static OutputStreamWriter out = null;
-    private static InputStreamReader in = null;
-
     public TextLangDetector(){
         super();
-
+        restHostUrlStr = TEXT_REST_HOST;
+        languages = getAllLanguages();
         writer = new CharArrayWriter();
-
-        try {
-            url = new URL("http://127.0.0.1:8000");
-        } catch (MalformedURLException e) {
-            e.printStackTrace();
-        }
-
-        try {
-            con = (HttpURLConnection) url.openConnection();
-            con.setRequestMethod("GET");
-
-            int responseCode = con.getResponseCode();
-            if (responseCode == 200) {
-                languages = new HashSet<String>();
-                in = new InputStreamReader(con.getInputStream());
-                String json = getStringFromInputStreamReader(in);
-                JsonArray jsonArray = new JsonParser().parse(json).getAsJsonArray();
-                for (JsonElement jsonElement: jsonArray) {
-                    languages.add(jsonElement.toString());
-                }
-                in.close();
-            }
-
-            con.disconnect();
-        } catch (ConnectException e) {
-            e.printStackTrace();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
     }
 
     @Override
@@ -119,63 +90,57 @@ public class TextLangDetector extends LanguageDetector {
     @Override
     public List<LanguageResult> detectAll() {
         List<LanguageResult> result = new ArrayList<>();
-
-        result.add(new LanguageResult(detect(writer.toString()), LanguageConfidence.MEDIUM, 0));
-
+        String language = detect(writer.toString());
+        if (language != null) {
+            result.add(new LanguageResult(language, LanguageConfidence.MEDIUM, 1));
+        } else {
+            result.add(new LanguageResult(language, LanguageConfidence.NONE, 0));
+        }
         return result;
     }
 
-    private String detect(String content){
-        String language = "error";
-
+    private Set<String> getAllLanguages() {
+        Set<String> languages = new HashSet<>();
         try {
-            con = (HttpURLConnection) url.openConnection();
-            con.setRequestMethod("PUT");
-            con.setDoOutput(true);
-
-            out = new OutputStreamWriter(con.getOutputStream());
-            out.write(content);
-            out.close();
-
-            int responseCode = con.getResponseCode();
-            if (responseCode == 200) {
-                in = new InputStreamReader(con.getInputStream());
-                String json = getStringFromInputStreamReader(in);
-                language = new JsonParser().parse(json).getAsJsonObject().get("lang").getAsString();
-                in.close();
+            Response response = WebClient
+                    .create(restHostUrlStr + TEXT_LID_PATH)
+                    .get();
+            String json = response.readEntity(String.class);
+            JsonArray jsonArray = new JsonParser().parse(json).getAsJsonObject().get("all_languages").getAsJsonArray();
+            for (JsonElement jsonElement : jsonArray) {
+                languages.add(jsonElement.toString());
             }
-
-            con.disconnect();
-        } catch (ConnectException e) {
-            e.printStackTrace();
-        } catch (IOException e) {
+        } catch (Exception e) {
             e.printStackTrace();
         }
+        return languages;
+    }
 
+    private String detect(String content) {
+        String language = null;
+        try {
+            Response response = WebClient
+                    .create(restHostUrlStr + TEXT_LID_PATH)
+                    .put(content);
+            String json = response.readEntity(String.class);
+            language = new JsonParser().parse(json).getAsJsonObject().get("language").getAsString();
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
         return language;
     }
 
-    // convert InputStreamReader to String
-    private String getStringFromInputStreamReader(InputStreamReader in) {
-        BufferedReader br = null;
-        StringBuilder sb = new StringBuilder();
-        String line;
+    protected static boolean canRun() {
         try {
-            br = new BufferedReader(in);
-            while ((line = br.readLine()) != null) {
-                sb.append(line);
-            }
-        } catch (IOException e) {
+            Response response = WebClient
+                    .create(TEXT_REST_HOST + TEXT_LID_PATH)
+                    .get();
+            String json = response.readEntity(String.class);
+            JsonArray jsonArray = new JsonParser().parse(json).getAsJsonObject().get("all_languages").getAsJsonArray();
+            return jsonArray.size() != 0;
+        } catch (Exception e) {
             e.printStackTrace();
-        } finally {
-            if (br != null) {
-                try {
-                    br.close();
-                } catch (IOException e) {
-                    e.printStackTrace();
-                }
-            }
+            return false;
         }
-        return sb.toString();
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java
----------------------------------------------------------------------
diff --git a/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java b/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java
index 9b4bdd1..d2fe26c 100644
--- a/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java
+++ b/tika-langdetect/src/test/java/org/apache/tika/langdetect/TextLangDetectorTest.java
@@ -26,6 +26,7 @@ import java.util.List;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assume.assumeTrue;
 
 /**
  * Created by trevorlewis on 3/7/16.
@@ -34,17 +35,15 @@ public class TextLangDetectorTest {
 
     @Test
     public void test() throws Exception {
-        LanguageDetector detector = new TextLangDetector();
+        assumeTrue(TextLangDetector.canRun());
 
+        LanguageDetector detector = new TextLangDetector();
         LanguageWriter writer = new LanguageWriter(detector);
 
         List<String> lines = IOUtils.readLines(TextLangDetectorTest.class.getResourceAsStream("text-test.tsv"));
-
         for (String line : lines) {
             String[] data = line.split("\t");
-            if (data.length != 2) {
-                continue;
-            }
+            if (data.length != 2) continue;
 
             writer.reset();
             writer.append(data[1]);
@@ -52,9 +51,6 @@ public class TextLangDetectorTest {
             LanguageResult result = detector.detect();
             assertNotNull(result);
 
-            /*if (!data[0].equals(result.getLanguage())) {
-                System.out.println(result.getLanguage() + " : " + data[0] + " - " + data[1]);
-            }*/
             assertEquals(data[0], result.getLanguage());
         }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/ea0e68b4/tika-translate/pom.xml
----------------------------------------------------------------------
diff --git a/tika-translate/pom.xml b/tika-translate/pom.xml
index 3513c4d..f77b8c4 100644
--- a/tika-translate/pom.xml
+++ b/tika-translate/pom.xml
@@ -54,7 +54,7 @@
     <dependency>
       <groupId>org.apache.cxf</groupId>
       <artifactId>cxf-rt-frontend-jaxrs</artifactId>
-      <version>2.7.8</version>
+      <version>3.0.3</version>
     </dependency>
     <dependency>
       <groupId>com.fasterxml.jackson.jaxrs</groupId>

Reply via email to