Repository: incubator-joshua Updated Branches: refs/heads/7_confsystem [created] dc6cf9967
Add TypesafeConfig dependency Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f7513abb Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f7513abb Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f7513abb Branch: refs/heads/7_confsystem Commit: f7513abb8f152872c1607f8b77faed1989cf5b37 Parents: 0afbe53 Author: Felix Hieber <fhie...@amazon.com> Authored: Wed Sep 14 09:02:32 2016 +0200 Committer: Hieber, Felix <fhie...@amazon.de> Committed: Thu Sep 15 17:29:11 2016 +0200 ---------------------------------------------------------------------- joshua-core/pom.xml | 5 + .../org/apache/joshua/decoder/ArgsParser.java | 116 -------- .../joshua/decoder/ff/SourceDependentFF.java | 29 -- .../ff/similarity/EdgePhraseSimilarityFF.java | 278 ------------------- 4 files changed, 5 insertions(+), 423 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f7513abb/joshua-core/pom.xml ---------------------------------------------------------------------- diff --git a/joshua-core/pom.xml b/joshua-core/pom.xml index 2dd046b..b2646ca 100644 --- a/joshua-core/pom.xml +++ b/joshua-core/pom.xml @@ -186,6 +186,11 @@ <artifactId>concurrent</artifactId> <version>1.3.4</version> </dependency> + <dependency> + <groupId>com.typesafe</groupId> + <artifactId>config</artifactId> + <version>1.2.1</version> + </dependency> <!-- Test Dependencies --> <dependency> http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f7513abb/joshua-core/src/main/java/org/apache/joshua/decoder/ArgsParser.java ---------------------------------------------------------------------- diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ArgsParser.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ArgsParser.java deleted file mode 100644 index 97baa27..0000000 --- a/joshua-core/src/main/java/org/apache/joshua/decoder/ArgsParser.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.joshua.decoder; - -import java.io.IOException; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Paths; - -import org.apache.joshua.util.io.LineReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author orluke - * - */ -public class ArgsParser { - - private static final Logger LOG = LoggerFactory.getLogger(ArgsParser.class); - - private String configFile = null; - - /** - * Parse the arguments passed from the command line when the JoshuaDecoder application was - * executed from the command line. - * - * @param args string array of input arguments - * @param config the {@link org.apache.joshua.decoder.JoshuaConfiguration} - * @throws IOException if there is an error wit the input arguments - */ - public ArgsParser(String[] args, JoshuaConfiguration config) throws IOException { - - /* - * Look for a verbose flag, -v. - * - * Look for an argument to the "-config" flag to find the config file, if any. - */ - if (args.length >= 1) { - // Search for a verbose flag - for (int i = 0; i < args.length; i++) { - if (args[i].equals("-v")) { - Decoder.VERBOSE = Integer.parseInt(args[i + 1].trim()); - config.setVerbosity(Decoder.VERBOSE); - } - - if (args[i].equals("-version")) { - try (LineReader reader = new LineReader(String.format("%s/VERSION", System.getenv("JOSHUA")));) { - reader.readLine(); - String version = reader.readLine().split("\\s+")[2]; - System.out.println(String.format("The Apache Joshua machine translator, version %s", version)); - System.out.println("joshua.incubator.apache.org"); - System.exit(0); - } - } else if (args[i].equals("-license")) { - try { - Files.readAllLines(Paths.get(String.format("%s/../LICENSE", - JoshuaConfiguration.class.getProtectionDomain().getCodeSource().getLocation() - .getPath())), Charset.defaultCharset()).forEach(System.out::println); - } catch (IOException e) { - throw new RuntimeException("FATAL: missing license file!", e); - } - System.exit(0); - } - } - - // Search for the configuration file from the end (so as to take the last one) - for (int i = args.length-1; i >= 0; i--) { - if (args[i].equals("-c") || args[i].equals("-config")) { - - setConfigFile(args[i + 1].trim()); - try { - LOG.info("Parameters read from configuration file: {}", getConfigFile()); - config.readConfigFile(getConfigFile()); - } catch (IOException e) { - throw new RuntimeException(e); - } - break; - } - } - - // Now process all the command-line args - config.processCommandLineOptions(args); - } - } - - /** - * @return the configFile - */ - public String getConfigFile() { - return configFile; - } - - /** - * @param configFile the configFile to set - */ - public void setConfigFile(String configFile) { - this.configFile = configFile; - } -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f7513abb/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java ---------------------------------------------------------------------- diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java deleted file mode 100644 index dec509f..0000000 --- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/SourceDependentFF.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.joshua.decoder.ff; - -import org.apache.joshua.decoder.segment_file.Sentence; - -public interface SourceDependentFF extends Cloneable { - - void setSource(Sentence sentence); - - FeatureFunction clone(); - -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f7513abb/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java ---------------------------------------------------------------------- diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java deleted file mode 100644 index 38bd373..0000000 --- a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.joshua.decoder.ff.similarity; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.PrintWriter; -import java.net.Socket; -import java.net.UnknownHostException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import com.google.common.base.Throwables; - -import org.apache.joshua.corpus.Vocabulary; -import org.apache.joshua.decoder.JoshuaConfiguration; -import org.apache.joshua.decoder.chart_parser.SourcePath; -import org.apache.joshua.decoder.ff.FeatureVector; -import org.apache.joshua.decoder.ff.StatefulFF; -import org.apache.joshua.decoder.ff.SourceDependentFF; -import org.apache.joshua.decoder.ff.state_maintenance.DPState; -import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState; -import org.apache.joshua.decoder.ff.tm.Rule; -import org.apache.joshua.decoder.hypergraph.HGNode; -import org.apache.joshua.decoder.segment_file.Sentence; -import org.apache.joshua.util.Cache; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class EdgePhraseSimilarityFF extends StatefulFF implements SourceDependentFF { - - private static final Logger LOG = LoggerFactory.getLogger(EdgePhraseSimilarityFF.class); - - private static final Cache<String, Float> cache = new Cache<>(100000000); - - private final String host; - private final int port; - - private PrintWriter serverAsk; - private BufferedReader serverReply; - - private int[] source; - - private final int MAX_PHRASE_LENGTH = 4; - - public EdgePhraseSimilarityFF(FeatureVector weights, String[] args, JoshuaConfiguration config) throws NumberFormatException, UnknownHostException, IOException { - super(weights, "EdgePhraseSimilarity", args, config); - - this.host = parsedArgs.get("host"); - this.port = Integer.parseInt(parsedArgs.get("port")); - - initializeConnection(); - } - - private void initializeConnection() throws NumberFormatException, IOException { - LOG.info("Opening connection."); - Socket socket = new Socket(host, port); - serverAsk = new PrintWriter(socket.getOutputStream(), true); - serverReply = new BufferedReader(new InputStreamReader(socket.getInputStream())); - } - - @Override - public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath, - Sentence sentence, Accumulator acc) { - - float value = computeScore(rule, tailNodes); - acc.add(featureId, value); - - // TODO 07/2013: EdgePhraseSimilarity needs to know its order rather than inferring it from tail - // nodes. - return new NgramDPState(new int[1], new int[1]); - } - - @Override - public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath path, Sentence sentence, Accumulator acc) { - return null; - } - - public float computeScore(Rule rule, List<HGNode> tailNodes) { - if (tailNodes == null || tailNodes.isEmpty()) - return 0; - - // System.err.println("RULE [" + spanStart + ", " + spanEnd + "]: " + rule.toString()); - - int[] target = rule.getTarget(); - int lm_state_size = 0; - for (HGNode node : tailNodes) { - NgramDPState state = (NgramDPState) node.getDPState(stateIndex); - lm_state_size += state.getLeftLMStateWords().length + state.getRightLMStateWords().length; - } - - ArrayList<int[]> batch = new ArrayList<>(); - - // Build joined target string. - int[] join = new int[target.length + lm_state_size]; - - int idx = 0, num_gaps = 1, num_anchors = 0; - int[] anchors = new int[rule.getArity() * 2]; - int[] indices = new int[rule.getArity() * 2]; - int[] gaps = new int[rule.getArity() + 2]; - gaps[0] = 0; - for (int t = 0; t < target.length; t++) { - if (target[t] < 0) { - HGNode node = tailNodes.get(-(target[t] + 1)); - if (t != 0) { - indices[num_anchors] = node.i; - anchors[num_anchors++] = idx; - } - NgramDPState state = (NgramDPState) node.getDPState(stateIndex); - // System.err.print("LEFT: "); - // for (int w : state.getLeftLMStateWords()) System.err.print(Vocabulary.word(w) + " "); - // System.err.println(); - for (int w : state.getLeftLMStateWords()) - join[idx++] = w; - int GAP = 0; - join[idx++] = GAP; - gaps[num_gaps++] = idx; - // System.err.print("RIGHT: "); - // for (int w : state.getRightLMStateWords()) System.err.print(Vocabulary.word(w) + " "); - // System.err.println(); - for (int w : state.getRightLMStateWords()) - join[idx++] = w; - if (t != target.length - 1) { - indices[num_anchors] = node.j; - anchors[num_anchors++] = idx; - } - } else { - join[idx++] = target[t]; - } - } - gaps[gaps.length - 1] = join.length + 1; - - // int c = 0; - // System.err.print("> "); - // for (int k = 0; k < join.length; k++) { - // if (c < num_anchors && anchors[c] == k) { - // c++; - // System.err.print("| "); - // } - // System.err.print(Vocabulary.word(join[k]) + " "); - // } - // System.err.println("<"); - - int g = 0; - for (int a = 0; a < num_anchors; a++) { - if (a > 0 && anchors[a - 1] == anchors[a]) - continue; - if (anchors[a] > gaps[g + 1]) - g++; - int left = Math.max(gaps[g], anchors[a] - MAX_PHRASE_LENGTH + 1); - int right = Math.min(gaps[g + 1] - 1, anchors[a] + MAX_PHRASE_LENGTH - 1); - - int[] target_phrase = new int[right - left]; - System.arraycopy(join, left, target_phrase, 0, target_phrase.length); - int[] source_phrase = getSourcePhrase(indices[a]); - - if (source_phrase != null && target_phrase.length != 0) { - // System.err.println("ANCHOR: " + indices[a]); - batch.add(source_phrase); - batch.add(target_phrase); - } - } - return getSimilarity(batch); - } - - @Override - public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) { - return 0.0f; - } - - /** - * From SourceDependentFF interface. - */ - @Override - public void setSource(Sentence sentence) { - if (! sentence.isLinearChain()) - throw new RuntimeException("EdgePhraseSimilarity not defined for lattices"); - this.source = sentence.getWordIDs(); - } - - public EdgePhraseSimilarityFF clone() { - try { - return new EdgePhraseSimilarityFF(this.weights, args, config); - } catch (Exception e) { - throw Throwables.propagate(e); - } - } - - @Override - public float estimateCost(Rule rule, Sentence sentence) { - return 0.0f; - } - - private int[] getSourcePhrase(int anchor) { - int idx; - int length = Math.min(anchor, MAX_PHRASE_LENGTH - 1) - + Math.min(source.length - anchor, MAX_PHRASE_LENGTH - 1); - if (length <= 0) - return null; - int[] phrase = new int[length]; - idx = 0; - for (int p = Math.max(0, anchor - MAX_PHRASE_LENGTH + 1); p < Math.min(source.length, anchor - + MAX_PHRASE_LENGTH - 1); p++) - phrase[idx++] = source[p]; - return phrase; - } - - private float getSimilarity(List<int[]> batch) { - float similarity = 0.0f; - int count = 0; - StringBuilder query = new StringBuilder(); - List<String> to_cache = new ArrayList<>(); - query.append("xb"); - for (int i = 0; i < batch.size(); i += 2) { - int[] source = batch.get(i); - int[] target = batch.get(i + 1); - - if (Arrays.equals(source, target)) { - similarity += 1; - count++; - } else { - String source_string = Vocabulary.getWords(source); - String target_string = Vocabulary.getWords(target); - - String both; - if (source_string.compareTo(target_string) > 0) - both = source_string + " ||| " + target_string; - else - both = target_string + " ||| " + source_string; - - Float cached = cache.get(both); - if (cached != null) { - // System.err.println("SIM: " + source_string + " X " + target_string + " = " + cached); - similarity += cached; - count++; - } else { - query.append("\t").append(source_string); - query.append("\t").append(target_string); - to_cache.add(both); - } - } - } - if (!to_cache.isEmpty()) { - try { - serverAsk.println(query.toString()); - String response = serverReply.readLine(); - String[] scores = response.split("\\s+"); - for (int i = 0; i < scores.length; i++) { - Float score = Float.parseFloat(scores[i]); - cache.put(to_cache.get(i), score); - similarity += score; - count++; - } - } catch (Exception e) { - return 0; - } - } - return (count == 0 ? 0 : similarity / count); - } -}