Decoder's Translation class now contains more members including the possibility to store word alignment from the derivation. Allows use of Joshua decoder class in a larger code project to extract information, rather than relying on stdout. Also added a getter for JoshuaConfiguration in the Decoder.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/9501535d Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/9501535d Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/9501535d Branch: refs/heads/master Commit: 9501535dcd67b89e821fd686089f621c5721497f Parents: e70677d Author: Felix Hieber <[email protected]> Authored: Fri Feb 27 14:05:50 2015 +0100 Committer: Kellen Sunderland <[email protected]> Committed: Thu Mar 31 10:44:42 2016 +0200 ---------------------------------------------------------------------- .gitignore | 6 + resources/grammar.glue | 4 + resources/wa_grammar | 3 + src/joshua/decoder/Decoder.java | 4 + src/joshua/decoder/JoshuaConfiguration.java | 3 + src/joshua/decoder/Translation.java | 122 ++++++++++++--- src/joshua/decoder/ff/tm/Rule.java | 36 +++++ .../decoder/hypergraph/AlignedSourceTokens.java | 93 +++++++++++ .../decoder/hypergraph/KBestExtractor.java | 10 ++ .../decoder/hypergraph/ViterbiExtractor.java | 30 ++++ .../hypergraph/WordAlignmentExtractor.java | 55 +++++++ .../decoder/hypergraph/WordAlignmentState.java | 154 +++++++++++++++++++ tst/joshua/system/AlignmentMapTest.java | 53 +++++++ tst/joshua/system/StructuredOutputTest.java | 103 +++++++++++++ 14 files changed, 658 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 869300e..1238e15 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,9 @@ bin *~ .philog.LOGFILE +build +**.history +.settings +/eclipse-bin/ +.classpath +/target/ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/resources/grammar.glue ---------------------------------------------------------------------- diff --git a/resources/grammar.glue b/resources/grammar.glue new file mode 100644 index 0000000..69e1520 --- /dev/null +++ b/resources/grammar.glue @@ -0,0 +1,4 @@ +[GOAL] ||| <s> ||| <s> ||| 0 +[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1 +[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0 +[GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0 http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/resources/wa_grammar ---------------------------------------------------------------------- diff --git a/resources/wa_grammar b/resources/wa_grammar new file mode 100644 index 0000000..08c2c38 --- /dev/null +++ b/resources/wa_grammar @@ -0,0 +1,3 @@ +[X] ||| A [X,1] B1 [X,2] B2 C ||| a b [X,2] c1 [X,1] c2 ||| 1 1 1 1 1 1 ||| 0-0 2-1 4-1 5-3 5-5 +[X] ||| U Z1 Z2 ||| n1 u z ||| 1 1 1 1 1 1 ||| 0-1 1-2 2-2 +[X] ||| K ||| k1 k2 k3 n1 n2 n3 ||| 1 1 1 1 1 1 ||| 0-0 0-1 0-2 http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/Decoder.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java index a9d7ba9..1a353ca 100644 --- a/src/joshua/decoder/Decoder.java +++ b/src/joshua/decoder/Decoder.java @@ -64,6 +64,10 @@ public class Decoder { private final JoshuaConfiguration joshuaConfiguration; + public JoshuaConfiguration getJoshuaConfiguration() { + return joshuaConfiguration; + } + /* * Many of these objects themselves are global objects. We pass them in when constructing other * objects, so that they all share pointers to the same object. This is good because it reduces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/JoshuaConfiguration.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java index 49ab87d..266198c 100644 --- a/src/joshua/decoder/JoshuaConfiguration.java +++ b/src/joshua/decoder/JoshuaConfiguration.java @@ -29,6 +29,9 @@ import joshua.util.io.LineReader; * @author Matt Post <[email protected]> */ public class JoshuaConfiguration { + + // whether to use structured output + public Boolean use_structured_output = false; // List of grammar files to read public ArrayList<String> tms = new ArrayList<String>(); http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/Translation.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/Translation.java b/src/joshua/decoder/Translation.java index fbf1571..a9f82f7 100644 --- a/src/joshua/decoder/Translation.java +++ b/src/joshua/decoder/Translation.java @@ -3,6 +3,7 @@ package joshua.decoder; import java.io.BufferedWriter; import java.io.IOException; import java.io.StringWriter; +import java.util.Arrays; import java.util.List; import joshua.decoder.ff.FeatureFunction; @@ -10,6 +11,7 @@ import joshua.decoder.ff.lm.StateMinimizingLanguageModel; import joshua.decoder.hypergraph.HyperGraph; import joshua.decoder.hypergraph.KBestExtractor; import joshua.decoder.hypergraph.ViterbiExtractor; +import joshua.decoder.hypergraph.WordAlignmentState; import joshua.decoder.io.DeNormalize; import joshua.decoder.segment_file.Sentence; @@ -39,20 +41,31 @@ public class Translation { return rawTranslation; } + private WordAlignmentState alignment = null; + private float score = 0; + private float translationTime; + public Translation(Sentence source, HyperGraph hypergraph, List<FeatureFunction> featureFunctions, JoshuaConfiguration joshuaConfiguration) { this.source = source; + + if (joshuaConfiguration.use_structured_output) { + + // create structured output instead of the String manipulation below. + createStructuredOutput(source, hypergraph); + + } else { + + StringWriter sw = new StringWriter(); + BufferedWriter out = new BufferedWriter(sw); + + try { + if (hypergraph != null) { + if (!joshuaConfiguration.hypergraphFilePattern.equals("")) { + hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, source.id()), featureFunctions); + } - StringWriter sw = new StringWriter(); - BufferedWriter out = new BufferedWriter(sw); - - try { - if (hypergraph != null) { - if (!joshuaConfiguration.hypergraphFilePattern.equals("")) { - hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, source.id()), featureFunctions); - } - - long startTime = System.currentTimeMillis(); + long startTime = System.currentTimeMillis(); // We must put this weight as zero, otherwise we get an error when we try to retrieve it // without checking @@ -75,9 +88,15 @@ public class Translation { .replace("%S", DeNormalize.processSingleLine(rawTranslation)) .replace("%c", String.format("%.3f", hypergraph.goalNode.getScore())) .replace("%i", String.format("%d", source.id())); + + /* %a causes output of word level alignments between input and output hypothesis */ + if (joshuaConfiguration.outputFormat.contains("%a")) { + translation = translation.replace("%a", ViterbiExtractor.extractViterbiAlignment(hypergraph.goalNode)); + } out.write(translation); out.newLine(); + } else { KBestExtractor kBestExtractor = new KBestExtractor(source, featureFunctions, Decoder.weights, false, joshuaConfiguration); kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out); @@ -91,9 +110,11 @@ public class Translation { } } - float seconds = (float) (System.currentTimeMillis() - startTime) / 1000.0f; - Decoder.LOG(1, String.format("Input %d: %d-best extraction took %.3f seconds", id(), - joshuaConfiguration.topN, seconds)); + float seconds = (float) (System.currentTimeMillis() - startTime) / 1000.0f; + Decoder.LOG(1, String.format("Input %d: %d-best extraction took %.3f seconds", id(), + joshuaConfiguration.topN, seconds)); + this.translationTime = seconds; + } else { @@ -113,10 +134,14 @@ public class Translation { out.newLine(); } - out.flush(); - } catch (IOException e) { - e.printStackTrace(); - System.exit(1); + out.flush(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + + this.output = sw.toString(); + } /* @@ -129,8 +154,40 @@ public class Translation { break; } } + + } - this.output = sw.toString(); + /** + * Instead of returning a single string with output information appended + * (if JoshuaConfig.use_structured_output == false), + * write Viterbi information (score, translation, word alignment) to member + * variables for easier access from outside pipelines. + */ + private void createStructuredOutput(Sentence source, HyperGraph hypergraph) { + + this.translationTime = 0; + + long startTime = System.currentTimeMillis(); + + if (hypergraph != null) { + + this.output = ViterbiExtractor.extractViterbiString(hypergraph.goalNode).trim(); + // trims whitespaces (same idiom as in existing Joshua code (65) + this.output = this.output.substring(this.output.indexOf(' ') + 1, this.output.lastIndexOf(' ')); + this.alignment = ViterbiExtractor.buildViterbiAlignment(hypergraph.goalNode); + this.score = hypergraph.goalNode.getScore(); + + } else { + + this.output = this.source.source(); + this.alignment = null; + + } + + this.translationTime = (System.currentTimeMillis() - startTime) / 1000.0f; + + Decoder.LOG(1, String.format("Translation %d: %.3f %s (%.3f)", source.id(), hypergraph.goalNode.getScore(), this.output, this.translationTime)); + } public Sentence getSourceSentence() { @@ -145,4 +202,33 @@ public class Translation { public String toString() { return output; } + + public float getTranslationTime() { + return translationTime; + } + + public String getTranslationString() { + return (output == null) ? "" : output.trim(); + } + + public List<List<Integer>> getWordAlignment() { + return alignment.toFinalList(); + } + + public String getWordAlignmentString() { + return (alignment == null) ? "" : alignment.toFinalString(); + } + + public float getTranslationScore() { + return score; + } + + public List<String> getTranslationTokens() { + return Arrays.asList(getTranslationString().split("\\s+")); + } + + public String getDeNormalizedTranslation() { + return DeNormalize.processSingleLine(getTranslationString()); + } + } http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/ff/tm/Rule.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/ff/tm/Rule.java b/src/joshua/decoder/ff/tm/Rule.java index 2f3ec47..90a44d5 100644 --- a/src/joshua/decoder/ff/tm/Rule.java +++ b/src/joshua/decoder/ff/tm/Rule.java @@ -1,8 +1,11 @@ package joshua.decoder.ff.tm; +import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.regex.Pattern; import joshua.corpus.Vocabulary; @@ -411,6 +414,39 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> { nts[index++] = -id; return nts; } + + /** + * Returns an array of size getArity() containing the source indeces of non terminals. + */ + public int[] getNonTerminalSourcePositions() { + int[] nonTerminalPositions = new int[getArity()]; + int ntPos = 0; + for (int sourceIdx = 0; sourceIdx < getFrench().length; sourceIdx++) { + if (getFrench()[sourceIdx] < 0) + nonTerminalPositions[ntPos++] = sourceIdx; + } + return nonTerminalPositions; + } + + /** + * Parses the Alignment byte[] into a Map from target to (possibly a list of) source positions. + * Used by the WordAlignmentExtractor. + */ + public Map<Integer, List<Integer>> getAlignmentMap() { + byte[] alignmentArray = getAlignment(); + Map<Integer, List<Integer>> alignmentMap = new HashMap<Integer, List<Integer>>(); + if (alignmentArray != null) { + for (int alignmentIdx = 0; alignmentIdx < alignmentArray.length; alignmentIdx += 2 ) { + int s = alignmentArray[alignmentIdx]; + int t = alignmentArray[alignmentIdx + 1]; + List<Integer> values = alignmentMap.get(t); + if (values == null) + alignmentMap.put(t, values = new ArrayList<Integer>()); + values.add(s); + } + } + return alignmentMap; + } /** * Return the English (target) nonterminals as list of Strings http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/AlignedSourceTokens.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/hypergraph/AlignedSourceTokens.java b/src/joshua/decoder/hypergraph/AlignedSourceTokens.java new file mode 100644 index 0000000..eb92f0d --- /dev/null +++ b/src/joshua/decoder/hypergraph/AlignedSourceTokens.java @@ -0,0 +1,93 @@ +package joshua.decoder.hypergraph; + +import java.util.LinkedList; +import java.util.ListIterator; + +/** + * Class that represents a one to (possibly) many alignment from target to + * source. Extends from a LinkedList. Instances of this class are updated by the + * WordAlignmentExtractor.substitute() method. The <shiftBy> method shifts the + * elements in the list by a scalar to reflect substitutions of non terminals in + * the rule. if indexes are final, i.e. the point instance has been substituted + * into a parent WordAlignmentState once, <isFinal> is set to true. This is + * necessary since the final source index of a point is known once we have + * substituted in a complete WordAlignmentState into its parent. If the index in + * the list is a non terminal, <isNonTerminal> = true + */ +class AlignedSourceTokens extends LinkedList<Integer> { + + private static final long serialVersionUID = 1L; + /** whether this Point refers to a non terminal in source&target */ + private boolean isNonTerminal = false; + /** whether this instance does not need to be updated anymore */ + private boolean isFinal = false; + /** whether the word this Point corresponds to has no alignment in source */ + private boolean isNull = false; + + AlignedSourceTokens() { + } + + void setFinal() { + isFinal = true; + } + + void setNonTerminal() { + isNonTerminal = true; + } + + void setNull() { + isNull = true; + } + + @Override + /** + * returns true if element was added. + */ + public boolean add(Integer x) { + if (isNull || isNonTerminal) + return false; + return super.add(x); + } + + public boolean isNonTerminal() { + return isNonTerminal; + } + + public boolean isFinal() { + return isFinal; + } + + public boolean isNull() { + return isNull; + } + + /** + * shifts each item in the LinkedList by <shift>. + * Only applies to items larger than <start> + */ + void shiftBy(int start, int shift) { + if (!isFinal && !isNull) { + ListIterator<Integer> it = this.listIterator(); + while (it.hasNext()) { + int x = it.next(); + if (x > start) { + it.set(x + shift); + } + } + } + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + if (isFinal) + sb.append("f"); + if (isNull) { + sb.append("[NULL]"); + } else { + sb.append(super.toString()); + } + if (isNonTerminal) + sb.append("^"); + return sb.toString(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/KBestExtractor.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java index 78cf6fe..46ab9ea 100644 --- a/src/joshua/decoder/hypergraph/KBestExtractor.java +++ b/src/joshua/decoder/hypergraph/KBestExtractor.java @@ -188,6 +188,12 @@ public class KBestExtractor { if (joshuaConfiguration.outputFormat.contains("%d")) { outputString = outputString.replace("%d", derivationState.getDerivation()); } + + /* %a causes output of word level alignments between input and output hypothesis */ + if (joshuaConfiguration.outputFormat.contains("%a")) { + outputString = outputString.replace("%a", derivationState.getWordAlignment()); + } + } return outputString; @@ -581,6 +587,10 @@ public class KBestExtractor { bleu = 0.0f; } + public String getWordAlignment() { + return visit(new WordAlignmentExtractor()).toString(); + } + /** * Computes a scaled approximate BLEU from the accumulated statistics. We know the number of * words; to compute the effective reference length, we take the real reference length statistic http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/ViterbiExtractor.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/hypergraph/ViterbiExtractor.java b/src/joshua/decoder/hypergraph/ViterbiExtractor.java index e19bb3f..95ec7c7 100644 --- a/src/joshua/decoder/hypergraph/ViterbiExtractor.java +++ b/src/joshua/decoder/hypergraph/ViterbiExtractor.java @@ -39,6 +39,36 @@ public class ViterbiExtractor { } return res.toString(); } + + public static String extractViterbiAlignment(HGNode node) { + WordAlignmentState viterbiAlignment = buildViterbiAlignment(node); + return viterbiAlignment.toFinalString(); + } + + // get one-best alignment for Viterbi string + public static WordAlignmentState buildViterbiAlignment(HGNode node) { + HyperEdge edge = node.bestHyperedge; + Rule rl = edge.getRule(); + if (rl == null) { // deductions under "goal item" does not have rule + if (edge.getTailNodes().size() != 1) + throw new RuntimeException("deduction under goal item have not equal one item"); + return buildViterbiAlignment(edge.getTailNodes().get(0)); + } + WordAlignmentState waState = new WordAlignmentState(rl, node.i); + if (edge.getTailNodes() != null) { + int[] english = rl.getEnglish(); + for (int c = 0; c < english.length; c++) { + if (Vocabulary.nt(english[c])) { + // determines the index in the tail node array by + // the index of the nonterminal in the source [english[c] gives a negative + // int] + int index = -(english[c] + 1); + waState.substituteIn(buildViterbiAlignment(edge.getTailNodes().get(index))); + } + } + } + return waState; + } // ######## find 1best hypergraph############# public static HyperGraph getViterbiTreeHG(HyperGraph hg_in) { http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java new file mode 100644 index 0000000..63619ee --- /dev/null +++ b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java @@ -0,0 +1,55 @@ +package joshua.decoder.hypergraph; + +import java.util.Stack; + +import joshua.corpus.Vocabulary; +import joshua.decoder.Decoder; +import joshua.decoder.ff.tm.Rule; +import joshua.decoder.hypergraph.KBestExtractor.DerivationState; +import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor; + +/** + * this class implements Joshua's Derivation Visitor interface. + * before() and after() methods are called at each visit of a rule in + * the hypergraph. + * We place WordAlignmentStates on a stack and merge/substitute them into each + * other if possible. At the end, the remaining last state on the stack + * should be complete (no NonTerminals to substitute anymore). + */ +public class WordAlignmentExtractor implements DerivationVisitor { + + private Stack<WordAlignmentState> stack; + + public WordAlignmentExtractor() { + stack = new Stack<WordAlignmentState>(); + } + + void merge(WordAlignmentState astate) { + // if alignment state has no NTs left AND stack is not empty + // AND parent object on stack still needs something to substitute + if (astate.isComplete() && stack.size() > 0 && !stack.peek().isComplete()) { + WordAlignmentState parentState = stack.pop(); + parentState.substituteIn(astate); + merge(parentState); + } else { + stack.add(astate); + } + } + + @Override + public void before(DerivationState state, int level) { + Rule rule = state.edge.getRule(); + if (rule != null) { + merge(new WordAlignmentState(rule, state.parentNode.i)); + } + } + + @Override + public void after(DerivationState state, int level) { + } + + public String toString() { + WordAlignmentState finalState = stack.pop(); + return finalState.toFinalString(); + } +} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/WordAlignmentState.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/hypergraph/WordAlignmentState.java b/src/joshua/decoder/hypergraph/WordAlignmentState.java new file mode 100644 index 0000000..e3b9598 --- /dev/null +++ b/src/joshua/decoder/hypergraph/WordAlignmentState.java @@ -0,0 +1,154 @@ +package joshua.decoder.hypergraph; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.ListIterator; +import java.util.Map; + +import joshua.decoder.ff.tm.Rule; + +/** + * This class encodes a derivation state in terms of a list of alignment points. + * Whenever a child instance is substituted into the parent instance, we need to + * adjust source indexes of the alignments. + * + * @author fhieber + */ +public class WordAlignmentState { + + /** + * each element in this list corresponds to a token on the target side of the + * rule. The values of the elements correspond to the aligned source token on + * the source side of the rule. + */ + private LinkedList<AlignedSourceTokens> trgPoints; + private int srcStart; + /** number of NTs we need to substitute. */ + private int numNT; + /** grows with substitutions of child rules. Reaches original Rule span if substitutions are complete */ + private int srcLength; + + /** + * construct AlignmentState object from a virgin Rule and its source span. + * Determines if state is complete (if no NT present) + */ + WordAlignmentState(Rule rule, int start) { + trgPoints = new LinkedList<AlignedSourceTokens>(); + srcLength = rule.getFrench().length; + numNT = rule.getArity(); + srcStart = start; + Map<Integer, List<Integer>> alignmentMap = rule.getAlignmentMap(); + int[] nonTermPositions = rule.getNonTerminalSourcePositions(); + int[] trg = rule.getEnglish(); + // for each target index, create a TargetAlignmentPoint + for (int trgIndex = 0; trgIndex < trg.length; trgIndex++) { + AlignedSourceTokens trgPoint = new AlignedSourceTokens(); + + if (trg[trgIndex] >= 0) { // this is a terminal symbol, check for alignment + if (alignmentMap.containsKey(trgIndex)) { + // add source indexes to TargetAlignmentPoint + for (int srcIdx : alignmentMap.get(trgIndex)) { + trgPoint.add(srcStart + srcIdx); + } + } else { // this target word is NULL-aligned + trgPoint.setNull(); + } + } else { // this is a nonterminal ([X]) [actually its the (negative) index of the NT in the source + trgPoint.setNonTerminal(); + trgPoint.add(srcStart + nonTermPositions[Math.abs(trg[trgIndex]) - 1]); + } + trgPoints.add(trgPoint); + } + } + + /** + * if there are no more NonTerminals to substitute, + * this state is said to be complete + */ + public boolean isComplete() { + return numNT == 0; + } + + /** + * builds the final alignment string in the standard alignment format: src - + * trg. Sorted by trg indexes. Disregards the sentence markers. + */ + public String toFinalString() { + StringBuilder sb = new StringBuilder(); + int t = 0; + for (AlignedSourceTokens pt : trgPoints) { + for (int s : pt) + sb.append(String.format(" %d-%d", s-1, t-1)); // disregard sentence + // markers + t++; + } + String result = sb.toString(); + if (!result.isEmpty()) + return result.substring(1); + return result; + } + + /** + * builds the final alignment list. + * each entry in the list corresponds to a list of aligned source tokens. + * First and last item in trgPoints is skipped. + */ + public List<List<Integer>> toFinalList() { + assert (isComplete() == true); + List<List<Integer>> alignment = new ArrayList<List<Integer>> (); + if (trgPoints.isEmpty()) + return alignment; + ListIterator<AlignedSourceTokens> it = trgPoints.listIterator(); + it.next(); // skip first item (sentence marker) + while (it.hasNext()) { + AlignedSourceTokens alignedSourceTokens = it.next(); + if (it.hasNext()) { // if not last element in trgPoints + List<Integer> newAlignedSourceTokens = new ArrayList<Integer>(); + for (Integer sourceIndex : alignedSourceTokens) + newAlignedSourceTokens.add(sourceIndex - 1); // shift by one to disregard sentence marker + alignment.add(newAlignedSourceTokens); + } + } + return alignment; + } + + /** + * String representation for debugging. + */ + public String toString() { + return String.format("%s , len=%d start=%d, isComplete=%s", + trgPoints.toString(), srcLength, srcStart, this.isComplete()); + } + + /** + * substitutes a child WorldAlignmentState into this instance at the first + * NT it finds. Also shifts the indeces in this instance by the span/width of the + * child that is to be substituted. + * Substitution order is determined by the architecture of Joshua's hypergraph. + */ + void substituteIn(WordAlignmentState child) { + // update existing indexes by length of child (has no effect on NULL and + // NonTerminal points) + for (AlignedSourceTokens trgPoint : trgPoints) + trgPoint.shiftBy(child.srcStart, child.srcLength - 1); + + // now substitute in the child at first NT, modifying the list + ListIterator<AlignedSourceTokens> it = trgPoints.listIterator(); + while (it.hasNext()) { + AlignedSourceTokens trgPoint = it.next(); + if (trgPoint.isNonTerminal()) { // found first NT + it.remove(); // remove NT symbol + for (AlignedSourceTokens childElement : child.trgPoints) { + childElement.setFinal(); // child source indexes are final, do not change them anymore + it.add(childElement); + } + this.srcLength += child.srcLength - 1; // -1 (NT) + this.numNT--; + break; + } + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/tst/joshua/system/AlignmentMapTest.java ---------------------------------------------------------------------- diff --git a/tst/joshua/system/AlignmentMapTest.java b/tst/joshua/system/AlignmentMapTest.java new file mode 100644 index 0000000..0eee8c8 --- /dev/null +++ b/tst/joshua/system/AlignmentMapTest.java @@ -0,0 +1,53 @@ +package joshua.system; + +import static org.junit.Assert.*; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import joshua.corpus.Vocabulary; +import joshua.decoder.ff.tm.Rule; + +import org.junit.Before; +import org.junit.Test; + +public class AlignmentMapTest { + + private Rule rule1 = null; + private Rule rule2 = null; + private static Map<Integer, List<Integer>> expectedAlignmentMap = null; + private static final int[] expectedNonTerminalPositions = {2,5}; + + @Before + public void setUp() throws Exception { + int[] sourceRhs = {Vocabulary.id("A1"),Vocabulary.id("A2"),-1,Vocabulary.id("B"),Vocabulary.id("C"),-2}; + int[] targetRhs = {Vocabulary.id("c"),Vocabulary.id("b1"),-1,Vocabulary.id("b2"),-4,Vocabulary.id("a")}; + int arity = 2; // 2 non terminals + String alignment = "0-5 1-5 3-1 3-3 4-0"; + expectedAlignmentMap = new HashMap<Integer, List<Integer>>(); + expectedAlignmentMap.put(0, Arrays.asList(4)); + expectedAlignmentMap.put(5, Arrays.asList(0,1)); + expectedAlignmentMap.put(1, Arrays.asList(3)); + expectedAlignmentMap.put(3, Arrays.asList(3)); + rule1 = new Rule(-1, sourceRhs, targetRhs, "", arity, alignment); + rule2 = new Rule(-1, sourceRhs, targetRhs, "", arity, null); // rule with no alignment + } + + @Test + public void test() { + // test regular rule with arity 2 + Map<Integer, List<Integer>> alignmentMap1 = rule1.getAlignmentMap(); + assertEquals(expectedAlignmentMap, alignmentMap1); + int[] nonTerminalPositions1 = rule1.getNonTerminalSourcePositions(); + assertArrayEquals(expectedNonTerminalPositions, nonTerminalPositions1); + + // test rule with no alignment + Map<Integer, List<Integer>> alignmentMap2 = rule2.getAlignmentMap(); + assertTrue(alignmentMap2.isEmpty()); + int[] nonTerminalPositions2 = rule2.getNonTerminalSourcePositions(); + assertArrayEquals(expectedNonTerminalPositions, nonTerminalPositions2); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/tst/joshua/system/StructuredOutputTest.java ---------------------------------------------------------------------- diff --git a/tst/joshua/system/StructuredOutputTest.java b/tst/joshua/system/StructuredOutputTest.java new file mode 100644 index 0000000..981f9d8 --- /dev/null +++ b/tst/joshua/system/StructuredOutputTest.java @@ -0,0 +1,103 @@ +package joshua.system; + +import java.util.Arrays; +import java.util.List; + +import joshua.decoder.Decoder; +import joshua.decoder.JoshuaConfiguration; +import joshua.decoder.Translation; +import joshua.decoder.segment_file.Sentence; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.Assert; + +/** + * Integration test for the complete Joshua decoder using a toy grammar that translates + * a bunch of capital letters to lowercase letters. Rules in the test grammar + * drop and generate additional words and simulate reordering of rules, so that + * proper extraction of word alignments can be tested. + * + * @author fhieber + */ +public class StructuredOutputTest { + + private JoshuaConfiguration joshuaConfig = null; + private Decoder decoder = null; + private Translation translation = null; + private static final String input = "A K B1 U Z1 Z2 B2 C"; + private static final String expectedTranslation = "a b n1 u z c1 k1 k2 k3 n1 n2 n3 c2"; + private static final String expectedWordAlignmentString = "0-0 2-1 6-1 3-3 4-4 5-4 7-5 1-6 1-7 1-8 7-12"; + private static final List<List<Integer>> expectedWordAlignment = Arrays.asList( + Arrays.asList(0), Arrays.asList(2, 6), Arrays.asList(), Arrays.asList(3), + Arrays.asList(4, 5), Arrays.asList(7), Arrays.asList(1), + Arrays.asList(1), Arrays.asList(1), Arrays.asList(), Arrays.asList(), + Arrays.asList(), Arrays.asList(7)); + private static final double expectedScore = -17.0; + + @Before + public void setUp() throws Exception { + joshuaConfig = new JoshuaConfiguration(); + joshuaConfig.search_algorithm = "cky"; + joshuaConfig.mark_oovs = false; + joshuaConfig.pop_limit = 100; + joshuaConfig.use_unique_nbest = false; + joshuaConfig.include_align_index = false; + joshuaConfig.topN = 0; + joshuaConfig.tms.add("thrax pt 20 resources/wa_grammar"); + joshuaConfig.tms.add("thrax glue -1 resources/grammar.glue"); + joshuaConfig.goal_symbol = "[GOAL]"; + joshuaConfig.default_non_terminal = "[X]"; + joshuaConfig.features.add("feature_function = OOVPenalty"); + joshuaConfig.weights.add("tm_pt_0 1"); + joshuaConfig.weights.add("tm_pt_1 1"); + joshuaConfig.weights.add("tm_pt_2 1"); + joshuaConfig.weights.add("tm_pt_3 1"); + joshuaConfig.weights.add("tm_pt_4 1"); + joshuaConfig.weights.add("tm_pt_5 1"); + joshuaConfig.weights.add("tm_glue_0 1"); + joshuaConfig.weights.add("OOVPenalty 2"); + decoder = new Decoder(joshuaConfig, ""); // second argument (configFile + // is not even used by the + // constructor/initialize) + } + + @After + public void tearDown() throws Exception { + decoder.cleanUp(); + decoder = null; + translation = null; + } + + private Translation decode(String input) { + Sentence sentence = new Sentence(input, 0, joshuaConfig); + return decoder.decode(sentence); + } + + @Test + public void test() { + + // test standard output + joshuaConfig.use_structured_output = false; + joshuaConfig.outputFormat = "%s | %a "; + translation = decode(input); + Assert.assertEquals(expectedTranslation + " | " + + expectedWordAlignmentString, translation.toString().trim()); + + // test structured output + joshuaConfig.use_structured_output = true; // set structured output creation to true + translation = decode(input); + Assert + .assertEquals(expectedTranslation, translation.getTranslationString()); + Assert.assertEquals(Arrays.asList(expectedTranslation.split("\\s+")), + translation.getTranslationTokens()); + Assert.assertEquals(expectedScore, translation.getTranslationScore(), + 0.00001); + Assert.assertEquals(expectedWordAlignment, translation.getWordAlignment()); + Assert.assertEquals(translation.getWordAlignment().size(), translation + .getTranslationTokens().size()); + + } + +}
