fix for TIKA-1943 contributed by Mark Duske

Includes support for Yandex Translate API

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/86145d99
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/86145d99
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/86145d99

Branch: refs/heads/master
Commit: 86145d99df22f6f75f0602e984872bc0ef7e53f1
Parents: f509917
Author: ReEvApp - Re-Evolution Applications, LLC <[email protected]>
Authored: Tue Apr 12 14:01:39 2016 -0400
Committer: ReEvApp - Re-Evolution Applications, LLC <[email protected]>
Committed: Tue Apr 12 14:01:39 2016 -0400

----------------------------------------------------------------------
 .../language/translate/YandexTranslator.java    | 175 +++++++++++++++++++
 1 file changed, 175 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/86145d99/tika-translate/src/main/java/org/apache/tika/language/translate/YandexTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/YandexTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/YandexTranslator.java
new file mode 100644
index 0000000..dc0d14c
--- /dev/null
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/YandexTranslator.java
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.language.translate;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Properties;
+
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.translate.Translator;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * An implementation of a REST client for the YANDEX <a 
href="https://tech.yandex.com/translate/";>Translate API</a>.
+ * You can sign up for free access online on the <a 
href="https://tech.yandex.com/key/form.xml?service=trnsl";>API Key form</a>
+ * and set your Application's User Key in the 
<code>translator.yandex.properties</code> file.
+ */
+public class YandexTranslator implements Translator {
+    
+    /**
+     * Yandex Translate API service end-point URL
+     */
+    private static final String YANDEX_TRANSLATE_URL_BASE = 
"https://translate.yandex.net/api/v1.5/tr.json/translate";;
+
+    /**
+     * Default USer-Key, a real User-Key must be provided before the Lingo24 
can successfully request translations
+     */
+    private static final String DEFAULT_KEY = "dummy-key";
+
+    /**
+     * Identifies the client of the request, used for authentication 
+     */
+    private String apiKey;
+    
+    /**
+     * The Yandex Translate API can handle text in <b>plain</b> and/or 
<b>html</b> format, the default
+     * format is <b>plain</b>
+     */
+    private String format = "plain";
+
+    public YandexTranslator() {
+        Properties config = new Properties();
+        try {
+            config.load(YandexTranslator.class
+                    .getResourceAsStream(
+                            "translator.yandex.properties"));
+            this.apiKey = config.getProperty("translator.api-key");
+            this.format = config.getProperty("translator.text.format");
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    @Override
+    public String translate(String text, String sourceLanguage,
+            String targetLanguage) throws TikaException, IOException {
+        if (!this.isAvailable()) {
+            return text;
+        }
+        
+        WebClient client = WebClient.create(YANDEX_TRANSLATE_URL_BASE);
+        
+        String langCode;
+        
+        if (sourceLanguage == null) {
+            //Translate Service will identify source language
+            langCode = targetLanguage;
+        } else {
+            //Source language is well known
+            langCode = sourceLanguage + '-' + targetLanguage;
+        }
+
+        //TODO Add support for text over 10k characters
+        Response response = client.accept(MediaType.APPLICATION_JSON)
+                .query("key", this.apiKey).query("lang", langCode)
+                .query("text", text).get();
+        BufferedReader reader = new BufferedReader(new InputStreamReader(
+                (InputStream) response.getEntity(), UTF_8));
+        String line = null;
+        StringBuffer responseText = new StringBuffer();
+        while ((line = reader.readLine()) != null) {
+            responseText.append(line);
+        }
+
+        try {
+            ObjectMapper mapper = new ObjectMapper();
+            JsonNode jsonResp = mapper.readTree(responseText.toString());
+            
+            if (!jsonResp.findValuesAsText("code").isEmpty()) {
+                String code = jsonResp.findValuesAsText("code").get(0);
+                if (code.equals("200")) {
+                    return jsonResp.findValue("text").get(0).asText();
+                } else {
+                    throw new 
TikaException(jsonResp.findValue("message").get(0).asText());
+                }
+            } else {
+                throw new TikaException("Return message not recognized: " + 
responseText.toString().substring(0, Math.min(responseText.length(), 100)));
+            }
+        } catch (JsonParseException e) {
+            throw new TikaException("Error requesting translation from '" + 
sourceLanguage + "' to '" + targetLanguage + "', JSON response from Lingo24 is 
not well formatted: " + responseText.toString());
+        }
+    }
+
+
+    /**
+     * Get the API Key in use for client authentication
+     * @return API Key
+     */
+    public String getApiKey() {
+        return apiKey;
+    }
+
+    /**
+     * Set the API Key for client authentication
+     * @param apiKey API Key
+     */
+    public void setApiKey(String apiKey) {
+        this.apiKey = apiKey;
+    }
+
+    /**
+     * Retrieve the current text format setting.
+     * The Yandex Translate API can handle text in <b>plain</b> and/or 
<b>html</b> format, the default
+     * format is <b>plain</b>
+     * @return
+     */
+    public String getFormat() {
+        return format;
+    }
+
+    /**
+     * Set the text format to use (plain/html)
+     * @param format Text format setting, either plain or html
+     */
+    public void setFormat(String format) {
+        this.format = format;
+    }
+
+    @Override
+    public String translate(String text, String targetLanguage)
+            throws TikaException, IOException {
+        return this.translate(text, null, targetLanguage);
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return this.apiKey!=null && !this.apiKey.equals(DEFAULT_KEY);
+    }
+
+}

Reply via email to