This is an automated email from the ASF dual-hosted git repository. mattmann pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 2c4ba60e2ce8f3388b92660f842fd63787ee4a5e Author: Chris Mattmann <chris.mattm...@gmail.com> AuthorDate: Sun May 2 08:10:51 2021 -0700 Backport Merge branch 'TIKA-3329' of https://github.com/thammegowda/tika into main --- .../tika/language/translate/RTGTranslator.java | 142 +++++++++++++++++++++ .../org.apache.tika.language.translate.Translator | 1 + .../tika/language/translate/RTGTranslatorTest.java | 62 +++++++++ 3 files changed, 205 insertions(+) diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/RTGTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/RTGTranslator.java new file mode 100644 index 0000000..ef366e2 --- /dev/null +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/RTGTranslator.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.tika.language.translate; + +import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider; +import org.apache.cxf.jaxrs.client.WebClient; +import org.apache.tika.exception.TikaException; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; + + +/** + * <p>This translator is designed to work with a TCP-IP available + * RTG translation server, specifically the + * <a href="https://isi-nlp.github.io/rtg/#_rtg_serve"> + * REST-based RTG server</a>.</p> + * To get Docker image: + * https://hub.docker.com/repository/docker/tgowda/rtg-model <br/> + * <pre> + * {code + * # without GPU + * docker run --rm -i -p 6060:6060 tgowda/rtg-model:500toEng-v1 + * # Or, with GPU device 0 + * docker run --rm -i -p 6060:6060 --gpus '"device=0"' tgowda/rtg-model:500toEng-v1 + * } + * </pre> + * + * <p>If you were to interact with the server via curl a request + * would look as follows</p> + * + * <pre> + * {code + * curl --data "source=Comment allez-vous?" \ + * --data "source=Bonne journée" \ + * http://localhost:6060/translate + * } + * </pre> + * + * RTG requires input to be pre-formatted into sentences, one per line, + * so this translation implementation takes care of that. + */ +public class RTGTranslator extends AbstractTranslator { + + public static final String RTG_TRANSLATE_URL_BASE = "http://localhost:6060"; + public static final String RTG_PROPS = "translator.rtg.properties"; + private static final Logger LOG = LoggerFactory.getLogger(RTGTranslator.class); + private WebClient client; + private boolean isAvailable = false; + + public RTGTranslator() { + String rtgBaseUrl = RTG_TRANSLATE_URL_BASE; + Properties config = new Properties(); + try (InputStream stream = getClass().getClassLoader().getResourceAsStream(RTG_PROPS)){ + if (stream != null){ + config.load(stream); + } + rtgBaseUrl = config.getProperty("rtg.base.url", rtgBaseUrl); + } catch (IOException e) { + LOG.warn(e.getMessage(), e); + } + LOG.info("RTG base URL: " + rtgBaseUrl); + List<Object> providers = new ArrayList<>(); + providers.add(new JacksonJsonProvider()); + try { + this.client = WebClient.create(rtgBaseUrl, providers); + this.isAvailable = client.head().getStatus() == 200; + } catch (Exception e){ + LOG.warn(e.getMessage(), e); + isAvailable = false; + } + + } + @Override + public String translate(String text, String sourceLanguage, String targetLanguage) + throws TikaException, IOException { + return this.translate(text); + } + + @Override + public String translate(String text, String targetLanguage) + throws TikaException, IOException { + return this.translate(text); + } + + public String translate(String text) throws TikaException, IOException { + if (!this.isAvailable) { + return text; + } + Map<String, List<Object>> input = new HashMap<>(); + input.put("source", Arrays.asList(text.split("(?<=(?<![A-Z])\\. )|\\n"))); + Response response = client.path("translate") + .type(MediaType.APPLICATION_JSON) + .accept(MediaType.APPLICATION_JSON) + .post(input); + try (InputStreamReader reader = new InputStreamReader( + (InputStream) response.getEntity())) { + JSONParser parser = new JSONParser(); + JSONObject obj = (JSONObject) parser.parse(reader); + List<String> sentences = (List<String>) obj.get("translation"); + String output = String.join("\n", sentences); + return output; + } catch (ParseException e){ + throw new IOException(e.getMessage(), e); + } + } + + @Override + public boolean isAvailable() { + return this.isAvailable; + } +} diff --git a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator index f3dcad4..154beca 100644 --- a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator +++ b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator @@ -18,3 +18,4 @@ org.apache.tika.language.translate.GoogleTranslator org.apache.tika.language.translate.Lingo24Translator org.apache.tika.language.translate.CachedTranslator org.apache.tika.language.translate.JoshuaNetworkTranslator +org.apache.tika.language.translate.RTGTranslator diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/RTGTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/RTGTranslatorTest.java new file mode 100644 index 0000000..0c18e0e --- /dev/null +++ b/tika-translate/src/test/java/org/apache/tika/language/translate/RTGTranslatorTest.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.language.translate; + +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; + + +/** + * Test harness for the {@link RTGTranslator}. + * + */ +public class RTGTranslatorTest { + + private RTGTranslator translator; + + + @Before + public void setUp() { + translator = new RTGTranslator(); + } + + @Test + public void testSimpleTranslate() { + String source = "hola señor"; + String expected = "hello sir"; + + String result = null; + if (translator.isAvailable()) { + try { + result = translator.translate(source); + assertNotNull(result); + assertEquals("Result: [" + result + + "]: not equal to expected: [" + expected + "]", + expected, result.toLowerCase()); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + } + +}