[02/10] incubator-joshua git commit: Decoder's Translation class now contains more members including the possibility to store word alignment from the derivation. Allows use of Joshua decoder class in a larger code project to extract information, rather than relying on stdout.

mjpost Fri, 01 Apr 2016 15:57:08 -0700

Decoder's Translation class now contains more members including the possibility 
to store word alignment from the derivation. Allows use of Joshua decoder class 
in a larger code project to extract information, rather than relying on stdout. 
Also added a getter for JoshuaConfiguration in the Decoder.



Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/9501535d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/9501535d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/9501535d

Branch: refs/heads/master
Commit: 9501535dcd67b89e821fd686089f621c5721497f
Parents: e70677d
Author: Felix Hieber <[email protected]>
Authored: Fri Feb 27 14:05:50 2015 +0100
Committer: Kellen Sunderland <[email protected]>
Committed: Thu Mar 31 10:44:42 2016 +0200

----------------------------------------------------------------------
 .gitignore                                      |   6 +
 resources/grammar.glue                          |   4 +
 resources/wa_grammar                            |   3 +
 src/joshua/decoder/Decoder.java                 |   4 +
 src/joshua/decoder/JoshuaConfiguration.java     |   3 +
 src/joshua/decoder/Translation.java             | 122 ++++++++++++---
 src/joshua/decoder/ff/tm/Rule.java              |  36 +++++
 .../decoder/hypergraph/AlignedSourceTokens.java |  93 +++++++++++
 .../decoder/hypergraph/KBestExtractor.java      |  10 ++
 .../decoder/hypergraph/ViterbiExtractor.java    |  30 ++++
 .../hypergraph/WordAlignmentExtractor.java      |  55 +++++++
 .../decoder/hypergraph/WordAlignmentState.java  | 154 +++++++++++++++++++
 tst/joshua/system/AlignmentMapTest.java         |  53 +++++++
 tst/joshua/system/StructuredOutputTest.java     | 103 +++++++++++++
 14 files changed, 658 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 869300e..1238e15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,3 +51,9 @@ bin
 *~
 
 .philog.LOGFILE
+build
+**.history
+.settings
+/eclipse-bin/
+.classpath
+/target/

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/resources/grammar.glue
----------------------------------------------------------------------
diff --git a/resources/grammar.glue b/resources/grammar.glue
new file mode 100644
index 0000000..69e1520
--- /dev/null
+++ b/resources/grammar.glue
@@ -0,0 +1,4 @@
+[GOAL] ||| <s> ||| <s> ||| 0
+[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
+[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
+[GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/resources/wa_grammar
----------------------------------------------------------------------
diff --git a/resources/wa_grammar b/resources/wa_grammar
new file mode 100644
index 0000000..08c2c38
--- /dev/null
+++ b/resources/wa_grammar
@@ -0,0 +1,3 @@
+[X] ||| A [X,1] B1 [X,2] B2 C ||| a b [X,2] c1 [X,1] c2 ||| 1 1 1 1 1 1 ||| 
0-0 2-1 4-1 5-3 5-5
+[X] ||| U Z1 Z2 ||| n1 u z ||| 1 1 1 1 1 1 ||| 0-1 1-2 2-2
+[X] ||| K ||| k1 k2 k3 n1 n2 n3 ||| 1 1 1 1 1 1 ||| 0-0 0-1 0-2

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index a9d7ba9..1a353ca 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -64,6 +64,10 @@ public class Decoder {
 
   private final JoshuaConfiguration joshuaConfiguration;
 
+  /**
+   * Returns the configuration object held in this decoder's
+   * {@code joshuaConfiguration} field.
+   *
+   * @return the decoder's JoshuaConfiguration (the same instance, not a copy)
+   */
+  public JoshuaConfiguration getJoshuaConfiguration() {
+    return joshuaConfiguration;
+  }
+
   /*
    * Many of these objects themselves are global objects. We pass them in when 
constructing other
    * objects, so that they all share pointers to the same object. This is good 
because it reduces

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java 
b/src/joshua/decoder/JoshuaConfiguration.java
index 49ab87d..266198c 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -29,6 +29,9 @@ import joshua.util.io.LineReader;
  * @author Matt Post <[email protected]>
  */
 public class JoshuaConfiguration {
+  
+  // whether to use structured output
+  public Boolean use_structured_output = false;
 
   // List of grammar files to read
   public ArrayList<String> tms = new ArrayList<String>();

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/Translation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Translation.java 
b/src/joshua/decoder/Translation.java
index fbf1571..a9f82f7 100644
--- a/src/joshua/decoder/Translation.java
+++ b/src/joshua/decoder/Translation.java
@@ -3,6 +3,7 @@ package joshua.decoder;
 import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.StringWriter;
+import java.util.Arrays;
 import java.util.List;
 
 import joshua.decoder.ff.FeatureFunction;
@@ -10,6 +11,7 @@ import joshua.decoder.ff.lm.StateMinimizingLanguageModel;
 import joshua.decoder.hypergraph.HyperGraph;
 import joshua.decoder.hypergraph.KBestExtractor;
 import joshua.decoder.hypergraph.ViterbiExtractor;
+import joshua.decoder.hypergraph.WordAlignmentState;
 import joshua.decoder.io.DeNormalize;
 import joshua.decoder.segment_file.Sentence;
 
@@ -39,20 +41,31 @@ public class Translation {
     return rawTranslation;
   }
 
+  private WordAlignmentState alignment = null;
+  private float score = 0;
+  private float translationTime;
+  
   public Translation(Sentence source, HyperGraph hypergraph, 
       List<FeatureFunction> featureFunctions, JoshuaConfiguration 
joshuaConfiguration) {
     this.source = source;
+    
+    if (joshuaConfiguration.use_structured_output) {
+      
+      // create structured output instead of the String manipulation below.
+      createStructuredOutput(source, hypergraph);
+      
+    } else {
+
+      StringWriter sw = new StringWriter();
+      BufferedWriter out = new BufferedWriter(sw);
+
+      try {
+        if (hypergraph != null) {
+          if (!joshuaConfiguration.hypergraphFilePattern.equals("")) {
+            
hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, 
source.id()), featureFunctions);
+          }
 
-    StringWriter sw = new StringWriter();
-    BufferedWriter out = new BufferedWriter(sw);
-
-    try {
-      if (hypergraph != null) {
-        if (!joshuaConfiguration.hypergraphFilePattern.equals("")) {
-          
hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, 
source.id()), featureFunctions);
-        }
-
-        long startTime = System.currentTimeMillis();
+          long startTime = System.currentTimeMillis();
 
         // We must put this weight as zero, otherwise we get an error when we 
try to retrieve it
         // without checking
@@ -75,9 +88,15 @@ public class Translation {
               .replace("%S", DeNormalize.processSingleLine(rawTranslation))
               .replace("%c", String.format("%.3f", 
hypergraph.goalNode.getScore()))
               .replace("%i", String.format("%d", source.id()));
+          
+          /* %a causes output of word level alignments between input and 
output hypothesis */
+          if (joshuaConfiguration.outputFormat.contains("%a")) {
+            translation = translation.replace("%a", 
ViterbiExtractor.extractViterbiAlignment(hypergraph.goalNode));
+          }
 
           out.write(translation);
           out.newLine();
+          
         } else  {
           KBestExtractor kBestExtractor = new KBestExtractor(source, 
featureFunctions, Decoder.weights, false, joshuaConfiguration);
           kBestExtractor.lazyKBestExtractOnHG(hypergraph, 
joshuaConfiguration.topN, out);
@@ -91,9 +110,11 @@ public class Translation {
           }
         }
 
-        float seconds = (float) (System.currentTimeMillis() - startTime) / 
1000.0f;
-        Decoder.LOG(1, String.format("Input %d: %d-best extraction took %.3f 
seconds", id(),
-            joshuaConfiguration.topN, seconds));
+          float seconds = (float) (System.currentTimeMillis() - startTime) / 
1000.0f;
+          Decoder.LOG(1, String.format("Input %d: %d-best extraction took %.3f 
seconds", id(),
+              joshuaConfiguration.topN, seconds));
+          this.translationTime = seconds;
+          
 
       } else {
         
@@ -113,10 +134,14 @@ public class Translation {
         out.newLine();
       }
 
-      out.flush();
-    } catch (IOException e) {
-      e.printStackTrace();
-      System.exit(1);
+        out.flush();
+      } catch (IOException e) {
+        e.printStackTrace();
+        System.exit(1);
+      }
+      
+      this.output = sw.toString();
+      
     }
 
     /*
@@ -129,8 +154,40 @@ public class Translation {
         break;
       }
     }
+    
+  }
 
-    this.output = sw.toString();
+  /**
+   * Populates the structured-output members of this translation instead of
+   * building a single formatted output string (which is what the constructor
+   * does when JoshuaConfiguration.use_structured_output is false).
+   * <p>
+   * With a hypergraph, the Viterbi string, its word alignment and the goal
+   * node score are stored in {@code output}, {@code alignment} and
+   * {@code score}. Without one (null), the source text is passed through
+   * unchanged, the alignment stays null and the score keeps its initial
+   * value. The elapsed time is stored in {@code translationTime} (seconds).
+   *
+   * @param source the input sentence; used here only for its id in the log
+   * @param hypergraph the decoding result, or null if there is none
+   */
+  private void createStructuredOutput(Sentence source, HyperGraph hypergraph) {
+
+    long startTime = System.currentTimeMillis();
+
+    if (hypergraph != null) {
+
+      String viterbi =
+          ViterbiExtractor.extractViterbiString(hypergraph.goalNode).trim();
+      // Drop the first and the last token (presumably the <s> and </s>
+      // sentence markers; same idiom as in existing Joshua code). If there
+      // are not two blanks to cut at, nothing is left between the markers,
+      // so store the empty string rather than letting substring() throw.
+      int begin = viterbi.indexOf(' ') + 1;
+      int end = viterbi.lastIndexOf(' ');
+      this.output = (begin <= end) ? viterbi.substring(begin, end) : "";
+      this.alignment =
+          ViterbiExtractor.buildViterbiAlignment(hypergraph.goalNode);
+      this.score = hypergraph.goalNode.getScore();
+
+    } else {
+
+      this.output = this.source.source();
+      this.alignment = null;
+
+    }
+
+    this.translationTime = (System.currentTimeMillis() - startTime) / 1000.0f;
+
+    // Log the stored score instead of dereferencing the hypergraph again:
+    // it is null on the pass-through branch above.
+    Decoder.LOG(1, String.format("Translation %d: %.3f %s (%.3f)",
+        source.id(), this.score, this.output, this.translationTime));
+
+  }
 
   public Sentence getSourceSentence() {
@@ -145,4 +202,33 @@ public class Translation {
   public String toString() {
     return output;
   }
+  
+  /**
+   * Returns the time measured while extracting this translation's output
+   * from the hypergraph.
+   *
+   * @return the value of the {@code translationTime} field, in seconds
+   */
+  public float getTranslationTime() {
+    return translationTime;
+  }
+
+  /**
+   * Returns the output string with leading and trailing whitespace removed.
+   *
+   * @return the trimmed {@code output}, or the empty string if it is null
+   */
+  public String getTranslationString() {
+    return (output == null) ? "" : output.trim();
+  }
+
+  /**
+   * Returns the word alignment as one list of source positions per target
+   * token, as built by WordAlignmentState.toFinalList().
+   *
+   * @return the alignment lists, or an empty list when no alignment has been
+   *         stored (the same situation in which getWordAlignmentString()
+   *         returns the empty string)
+   */
+  public List<List<Integer>> getWordAlignment() {
+    if (alignment == null) {
+      // The field is only filled when structured output is built from a
+      // hypergraph; without this guard the call below would throw a
+      // NullPointerException.
+      return new java.util.ArrayList<List<Integer>>();
+    }
+    return alignment.toFinalList();
+  }
+  
+  /**
+   * Returns the word alignment in string form.
+   *
+   * @return the result of toFinalString() on the stored alignment, or the
+   *         empty string if no alignment is stored
+   */
+  public String getWordAlignmentString() {
+    return (alignment == null) ? "" : alignment.toFinalString();
+  }
+
+  /**
+   * Returns the {@code score} field. createStructuredOutput() sets it to the
+   * score of the hypergraph's goal node; its initial value is 0.
+   *
+   * @return the stored translation score
+   */
+  public float getTranslationScore() {
+    return score;
+  }
+  
+  /**
+   * Splits the translation string on runs of whitespace.
+   *
+   * @return the tokens of getTranslationString() as a fixed-size list; an
+   *         empty list when the translation is empty
+   */
+  public List<String> getTranslationTokens() {
+    String translation = getTranslationString();
+    if (translation.isEmpty()) {
+      // "".split(...) yields one empty string, which would be reported as a
+      // token although there is none.
+      return Arrays.<String>asList();
+    }
+    return Arrays.asList(translation.split("\\s+"));
+  }
+  
+  /**
+   * Returns the translation string after passing it through
+   * DeNormalize.processSingleLine().
+   *
+   * @return the denormalized form of getTranslationString()
+   */
+  public String getDeNormalizedTranslation() {
+    return DeNormalize.processSingleLine(getTranslationString());
+  }
+  
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/ff/tm/Rule.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/Rule.java 
b/src/joshua/decoder/ff/tm/Rule.java
index 2f3ec47..90a44d5 100644
--- a/src/joshua/decoder/ff/tm/Rule.java
+++ b/src/joshua/decoder/ff/tm/Rule.java
@@ -1,8 +1,11 @@
 package joshua.decoder.ff.tm;
 
+import java.util.ArrayList;
 import java.util.Arrays;  
 import java.util.Comparator;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.regex.Pattern;
 
 import joshua.corpus.Vocabulary;
@@ -411,6 +414,39 @@ public class Rule implements Comparator<Rule>, 
Comparable<Rule> {
         nts[index++] = -id;
     return nts;
   }
+  
+  /**
+   * Returns an array of size getArity() containing the source indices of
+   * the nonterminals, i.e. the positions of getFrench() that hold a negative
+   * id, in left-to-right order.
+   */
+  public int[] getNonTerminalSourcePositions() {
+    int[] nonTerminalPositions = new int[getArity()];
+    // Fetch the source side once rather than on every loop test and access.
+    int[] french = getFrench();
+    int ntPos = 0;
+    for (int sourceIdx = 0; sourceIdx < french.length; sourceIdx++) {
+      if (french[sourceIdx] < 0)
+        nonTerminalPositions[ntPos++] = sourceIdx;
+    }
+    return nonTerminalPositions;
+  }
+  
+  /**
+   * Parses the alignment byte[] into a Map from target position to the list
+   * of source positions aligned to it. Used by WordAlignmentState.
+   * <p>
+   * The array is read as consecutive (source, target) pairs. A rule whose
+   * getAlignment() is null yields an empty map.
+   *
+   * @return a newly built map keyed by target index; never null
+   */
+  public Map<Integer, List<Integer>> getAlignmentMap() {
+    byte[] alignmentArray = getAlignment();
+    Map<Integer, List<Integer>> alignmentMap = new HashMap<Integer, 
List<Integer>>();
+    if (alignmentArray != null) {
+      // NOTE(review): assumes an even number of entries; an odd-length array
+      // would read past the end at alignmentIdx + 1 -- confirm upstream.
+      for (int alignmentIdx = 0; alignmentIdx < alignmentArray.length; 
alignmentIdx += 2 ) {
+        int s = alignmentArray[alignmentIdx];
+        int t = alignmentArray[alignmentIdx + 1];
+        List<Integer> values = alignmentMap.get(t);
+        if (values == null)
+          alignmentMap.put(t, values = new ArrayList<Integer>());
+        values.add(s);
+      }
+    }
+    return alignmentMap;
+  }
 
   /**
    * Return the English (target) nonterminals as list of Strings

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/AlignedSourceTokens.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/AlignedSourceTokens.java 
b/src/joshua/decoder/hypergraph/AlignedSourceTokens.java
new file mode 100644
index 0000000..eb92f0d
--- /dev/null
+++ b/src/joshua/decoder/hypergraph/AlignedSourceTokens.java
@@ -0,0 +1,93 @@
+package joshua.decoder.hypergraph;
+
+import java.util.LinkedList;
+import java.util.ListIterator;
+
+/**
+ * The source positions aligned to a single target-side token: a one to
+ * (possibly) many alignment from target to source, kept as a LinkedList of
+ * source indexes.
+ * <p>
+ * Instances are created and updated by WordAlignmentState. Three flags
+ * qualify the list:
+ * <ul>
+ * <li>{@code isNonTerminal}: the target symbol is a non terminal;</li>
+ * <li>{@code isFinal}: the point has been substituted into a parent
+ * WordAlignmentState once, so its source indexes are final and
+ * {@code shiftBy} leaves them alone;</li>
+ * <li>{@code isNull}: the target word has no aligned source word.</li>
+ * </ul>
+ */
+class AlignedSourceTokens extends LinkedList<Integer> {
+
+  private static final long serialVersionUID = 1L;
+  /** whether this Point refers to a non terminal in source&target */
+  private boolean isNonTerminal = false;
+  /** whether this instance does not need to be updated anymore */
+  private boolean isFinal = false;
+  /** whether the word this Point corresponds to has no alignment in source */
+  private boolean isNull = false;
+
+  /** Creates an empty point with all three flags cleared. */
+  AlignedSourceTokens() {
+  }
+
+  /** Marks the source indexes as final; shiftBy() becomes a no-op. */
+  void setFinal() {
+    isFinal = true;
+  }
+
+  /** Marks this point as a non terminal; add() rejects elements afterwards. */
+  void setNonTerminal() {
+    isNonTerminal = true;
+  }
+
+  /** Marks this point as NULL-aligned; add() rejects elements afterwards. */
+  void setNull() {
+    isNull = true;
+  }
+
+  /**
+   * Appends a source index unless this point has been marked NULL-aligned or
+   * non terminal, in which case nothing is stored.
+   *
+   * @return true if the element was added, false if it was rejected
+   */
+  @Override
+  public boolean add(Integer x) {
+    if (isNull || isNonTerminal)
+      return false;
+    return super.add(x);
+  }
+
+  public boolean isNonTerminal() {
+    return isNonTerminal;
+  }
+
+  public boolean isFinal() {
+    return isFinal;
+  }
+
+  public boolean isNull() {
+    return isNull;
+  }
+
+  /**
+   * Adds {@code shift} to every stored index that is strictly larger than
+   * {@code start}. Does nothing once the point is final or NULL-aligned.
+   */
+  void shiftBy(int start, int shift) {
+    if (!isFinal && !isNull) {
+      ListIterator<Integer> it = this.listIterator();
+      while (it.hasNext()) {
+        int x = it.next();
+        if (x > start) {
+          it.set(x + shift);
+        }
+      }
+    }
+  }
+
+  /**
+   * Debug representation: an "f" prefix if final, then "[NULL]" or the list
+   * contents, then a "^" suffix if this point is a non terminal.
+   */
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    if (isFinal)
+      sb.append("f");
+    if (isNull) {
+      sb.append("[NULL]");
+    } else {
+      sb.append(super.toString());
+    }
+    if (isNonTerminal)
+      sb.append("^");
+    return sb.toString();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java 
b/src/joshua/decoder/hypergraph/KBestExtractor.java
index 78cf6fe..46ab9ea 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -188,6 +188,12 @@ public class KBestExtractor {
       if (joshuaConfiguration.outputFormat.contains("%d")) {
         outputString = outputString.replace("%d", 
derivationState.getDerivation());
       }
+      
+      /* %a causes output of word level alignments between input and output 
hypothesis */
+      if (joshuaConfiguration.outputFormat.contains("%a")) {
+        outputString = outputString.replace("%a",  
derivationState.getWordAlignment());
+      }
+      
     }
 
     return outputString;
@@ -581,6 +587,10 @@ public class KBestExtractor {
       bleu = 0.0f;
     }
 
+    /**
+     * Builds the word alignment string of this derivation by passing a fresh
+     * WordAlignmentExtractor to visit() and converting the result to a
+     * string (presumably visit() returns the visitor it was given).
+     *
+     * @return the alignment string produced by the extractor
+     */
+    public String getWordAlignment() {
+      return visit(new WordAlignmentExtractor()).toString();
+    }
+
     /**
      * Computes a scaled approximate BLEU from the accumulated statistics. We 
know the number of
      * words; to compute the effective reference length, we take the real 
reference length statistic

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/ViterbiExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiExtractor.java 
b/src/joshua/decoder/hypergraph/ViterbiExtractor.java
index e19bb3f..95ec7c7 100644
--- a/src/joshua/decoder/hypergraph/ViterbiExtractor.java
+++ b/src/joshua/decoder/hypergraph/ViterbiExtractor.java
@@ -39,6 +39,36 @@ public class ViterbiExtractor {
     }
     return res.toString();
   }
+  
+  /**
+   * Returns the word alignment of the one-best derivation below the given
+   * node as a string.
+   *
+   * @param node the node whose best derivation is walked
+   * @return toFinalString() of the state built by buildViterbiAlignment()
+   */
+  public static String extractViterbiAlignment(HGNode node) {
+    WordAlignmentState viterbiAlignment = buildViterbiAlignment(node);
+    return viterbiAlignment.toFinalString();
+  }
+  
+  /**
+   * Recursively builds the one-best alignment state for the Viterbi string
+   * below {@code node}. Follows node.bestHyperedge, creates a
+   * WordAlignmentState from its rule (with node.i as the source start) and
+   * substitutes in the states of the tail nodes, one per target-side
+   * nonterminal, in target order.
+   *
+   * @param node the node whose best hyperedge is expanded
+   * @return the alignment state of the subtree below node
+   * @throws RuntimeException if a rule-less edge does not have exactly one
+   *         tail node
+   */
+  public static WordAlignmentState buildViterbiAlignment(HGNode node) {
+    HyperEdge edge = node.bestHyperedge;
+    Rule rl = edge.getRule();  
+    if (rl == null) { // deductions under "goal item" does not have rule
+      if (edge.getTailNodes().size() != 1)
+        throw new RuntimeException("deduction under goal item have not equal 
one item");
+      // nothing to align at this level: continue with the single child
+      return buildViterbiAlignment(edge.getTailNodes().get(0));
+    }
+    WordAlignmentState waState = new WordAlignmentState(rl, node.i);
+    if (edge.getTailNodes() != null) {
+      int[] english = rl.getEnglish();
+      for (int c = 0; c < english.length; c++) {
+        if (Vocabulary.nt(english[c])) {
+          // english[c] is a negative nonterminal index (-1, -2, ...); map it
+          // to the 0-based position in the tail node list
+          int index = -(english[c] + 1);
+          
waState.substituteIn(buildViterbiAlignment(edge.getTailNodes().get(index)));
+        }
+      }
+    }
+    return waState;
+  }
 
   // ######## find 1best hypergraph#############
   public static HyperGraph getViterbiTreeHG(HyperGraph hg_in) {

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java 
b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
new file mode 100644
index 0000000..63619ee
--- /dev/null
+++ b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
@@ -0,0 +1,55 @@
+package joshua.decoder.hypergraph;
+
+import java.util.Stack;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
+import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor;
+
+/**
+ * Derivation visitor that assembles the word alignment of a derivation.
+ * before() is called at each visit of a rule in the hypergraph and turns
+ * the rule into a WordAlignmentState. States are kept on a stack and
+ * merged/substituted into each other as soon as possible. At the end, the
+ * remaining last state on the stack should be complete (no NonTerminals to
+ * substitute anymore).
+ */
+public class WordAlignmentExtractor implements DerivationVisitor {
+
+  private final Stack<WordAlignmentState> stack;
+
+  public WordAlignmentExtractor() {
+    stack = new Stack<WordAlignmentState>();
+  }
+
+  /**
+   * Adds a state to the stack, first substituting it into its parent when
+   * that is possible.
+   *
+   * @param astate the state to merge into the stack
+   */
+  void merge(WordAlignmentState astate) {
+    // if alignment state has no NTs left AND stack is not empty
+    // AND parent object on stack still needs something to substitute
+    if (astate.isComplete() && !stack.isEmpty()
+        && !stack.peek().isComplete()) {
+      WordAlignmentState parentState = stack.pop();
+      parentState.substituteIn(astate);
+      // the parent may now be complete itself: try to merge it further up
+      merge(parentState);
+    } else {
+      stack.push(astate);
+    }
+  }
+
+  /**
+   * Creates the alignment state for the rule of the visited edge and merges
+   * it into the stack. Edges without a rule are skipped.
+   */
+  @Override
+  public void before(DerivationState state, int level) {
+    Rule rule = state.edge.getRule();
+    if (rule != null) {
+      merge(new WordAlignmentState(rule, state.parentNode.i));
+    }
+  }
+
+  /** Nothing to do when leaving a rule. */
+  @Override
+  public void after(DerivationState state, int level) {
+  }
+
+  /**
+   * Returns the alignment string of the state on top of the stack, or the
+   * empty string if no rule has been visited. The stack is only inspected,
+   * not modified, so the method can be called repeatedly.
+   */
+  @Override
+  public String toString() {
+    if (stack.isEmpty()) {
+      return "";
+    }
+    return stack.peek().toFinalString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/src/joshua/decoder/hypergraph/WordAlignmentState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentState.java 
b/src/joshua/decoder/hypergraph/WordAlignmentState.java
new file mode 100644
index 0000000..e3b9598
--- /dev/null
+++ b/src/joshua/decoder/hypergraph/WordAlignmentState.java
@@ -0,0 +1,154 @@
+package joshua.decoder.hypergraph;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Map;
+
+import joshua.decoder.ff.tm.Rule;
+
+/**
+ * This class encodes a derivation state in terms of a list of alignment
+ * points, one per target-side token. Whenever a child instance is
+ * substituted into the parent instance, the source indexes of the parent's
+ * alignments are adjusted.
+ * 
+ * @author fhieber
+ */
+public class WordAlignmentState {
+
+  /**
+   * each element in this list corresponds to a token on the target side of the
+   * rule. The values of the elements correspond to the aligned source token on
+   * the source side of the rule.
+   */
+  private LinkedList<AlignedSourceTokens> trgPoints;
+  /** source start passed to the constructor; added to every source index. */
+  private int srcStart;
+  /** number of NTs we need to substitute. */
+  private int numNT;
+  /**
+   * grows with substitutions of child rules. Reaches original Rule span if
+   * substitutions are complete
+   */
+  private int srcLength;
+
+  /**
+   * construct AlignmentState object from a virgin Rule and its source span.
+   * Determines if state is complete (if no NT present)
+   *
+   * @param rule the rule whose target side and alignment are encoded
+   * @param start offset added to the rule-internal source indexes
+   */
+  WordAlignmentState(Rule rule, int start) {
+    trgPoints = new LinkedList<AlignedSourceTokens>();
+    srcLength = rule.getFrench().length;
+    numNT = rule.getArity();
+    srcStart = start;
+    Map<Integer, List<Integer>> alignmentMap = rule.getAlignmentMap();
+    int[] nonTermPositions = rule.getNonTerminalSourcePositions();
+    int[] trg = rule.getEnglish();
+    // for each target index, create a TargetAlignmentPoint
+    for (int trgIndex = 0; trgIndex < trg.length; trgIndex++) {
+      AlignedSourceTokens trgPoint = new AlignedSourceTokens();
+
+      if (trg[trgIndex] >= 0) { // terminal symbol: look up its alignment
+        if (alignmentMap.containsKey(trgIndex)) {
+          // add source indexes to TargetAlignmentPoint
+          for (int srcIdx : alignmentMap.get(trgIndex)) {
+            trgPoint.add(srcStart + srcIdx);
+          }
+        } else { // this target word is NULL-aligned
+          trgPoint.setNull();
+        }
+      } else { // nonterminal: trg holds its negated 1-based NT index
+        trgPoint.setNonTerminal();
+        // NOTE(review): AlignedSourceTokens.add() rejects elements once
+        // setNonTerminal() has been called, so this add() stores nothing --
+        // confirm that an empty NT point is intended.
+        trgPoint.add(srcStart + nonTermPositions[Math.abs(trg[trgIndex]) - 1]);
+      }
+      trgPoints.add(trgPoint);
+    }
+  }
+
+  /**
+   * if there are no more NonTerminals to substitute,
+   * this state is said to be complete
+   */
+  public boolean isComplete() {
+    return numNT == 0;
+  }
+
+  /**
+   * builds the final alignment string in the standard alignment format: src -
+   * trg. Sorted by trg indexes. Both indexes are shifted down by one to
+   * disregard the sentence markers.
+   *
+   * @return blank-separated "src-trg" pairs; empty if there are none
+   */
+  public String toFinalString() {
+    StringBuilder sb = new StringBuilder();
+    int t = 0;
+    for (AlignedSourceTokens pt : trgPoints) {
+      for (int s : pt)
+        sb.append(String.format(" %d-%d", s-1, t-1)); // disregard sentence
+                                                      // markers
+      t++;
+    }
+    String result = sb.toString();
+    if (!result.isEmpty())
+      return result.substring(1); // drop the leading blank
+    return result;
+  }
+  
+  /**
+   * builds the final alignment list.
+   * each entry in the list corresponds to a list of aligned source tokens.
+   * First and last item in trgPoints is skipped.
+   *
+   * @return one list of source indexes (shifted down by one) per remaining
+   *         target token; empty if there are no target points
+   */
+  public List<List<Integer>> toFinalList() {
+    assert (isComplete() == true);
+    List<List<Integer>> alignment = new ArrayList<List<Integer>> ();
+    if (trgPoints.isEmpty())
+      return alignment;
+    ListIterator<AlignedSourceTokens> it = trgPoints.listIterator();
+    it.next(); // skip first item (sentence marker)
+    while (it.hasNext()) {
+      AlignedSourceTokens alignedSourceTokens = it.next();
+      if (it.hasNext()) { // if not last element in trgPoints
+        List<Integer> newAlignedSourceTokens = new ArrayList<Integer>();
+        for (Integer sourceIndex : alignedSourceTokens)
+          newAlignedSourceTokens.add(sourceIndex - 1); // disregard sentence marker
+        alignment.add(newAlignedSourceTokens);
+      }
+    }
+    return alignment;
+  }
+
+  /**
+   * String representation for debugging.
+   */
+  public String toString() {
+    return String.format("%s , len=%d start=%d, isComplete=%s",
+        trgPoints.toString(), srcLength, srcStart, this.isComplete());
+  }
+
+  /**
+   * substitutes a child WordAlignmentState into this instance at the first
+   * NT it finds. Also shifts the indices in this instance by the span/width
+   * of the child that is to be substituted.
+   * Substitution order is determined by the architecture of Joshua's
+   * hypergraph.
+   *
+   * @param child the state to splice in; its points are marked final and
+   *        become part of this state's list
+   */
+  void substituteIn(WordAlignmentState child) {
+    // update existing indexes by length of child (has no effect on NULL and
+    // NonTerminal points)
+    for (AlignedSourceTokens trgPoint : trgPoints)
+      trgPoint.shiftBy(child.srcStart, child.srcLength - 1);
+
+    // now substitute in the child at first NT, modifying the list
+    ListIterator<AlignedSourceTokens> it = trgPoints.listIterator();
+    while (it.hasNext()) {
+      AlignedSourceTokens trgPoint = it.next();
+      if (trgPoint.isNonTerminal()) { // found first NT
+        it.remove(); // remove NT symbol
+        for (AlignedSourceTokens childElement : child.trgPoints) {
+          childElement.setFinal(); // child source indexes must not change anymore
+          it.add(childElement);
+        }
+        this.srcLength += child.srcLength - 1; // -1 (NT)
+        this.numNT--;
+        break;
+      }
+    }
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/tst/joshua/system/AlignmentMapTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/AlignmentMapTest.java 
b/tst/joshua/system/AlignmentMapTest.java
new file mode 100644
index 0000000..0eee8c8
--- /dev/null
+++ b/tst/joshua/system/AlignmentMapTest.java
@@ -0,0 +1,53 @@
+package joshua.system;
+
+import static org.junit.Assert.*;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import joshua.corpus.Vocabulary;
+import joshua.decoder.ff.tm.Rule;
+
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit test for Rule.getAlignmentMap() and
+ * Rule.getNonTerminalSourcePositions(), run on a rule that carries an
+ * alignment string and on the same rule without one.
+ */
+public class AlignmentMapTest {
+  
+  private Rule rule1 = null;
+  private Rule rule2 = null;
+  // target index -> source indexes, rebuilt in setUp() before every test
+  private static Map<Integer, List<Integer>> expectedAlignmentMap = null;
+  // positions of the negative entries in sourceRhs below
+  private static final int[] expectedNonTerminalPositions = {2,5};
+
+  @Before
+  public void setUp() throws Exception {
+    int[] sourceRhs = 
{Vocabulary.id("A1"),Vocabulary.id("A2"),-1,Vocabulary.id("B"),Vocabulary.id("C"),-2};
+    // NOTE(review): the target side contains -4 although the arity is 2 --
+    // confirm whether -2 was intended; the assertions below do not depend on it.
+    int[] targetRhs = 
{Vocabulary.id("c"),Vocabulary.id("b1"),-1,Vocabulary.id("b2"),-4,Vocabulary.id("a")};
+    int arity = 2; // 2 non terminals
+    // "source-target" pairs; the expected map below is keyed by target index
+    String alignment = "0-5 1-5 3-1 3-3 4-0";
+    expectedAlignmentMap = new HashMap<Integer, List<Integer>>();
+    expectedAlignmentMap.put(0, Arrays.asList(4));
+    expectedAlignmentMap.put(5, Arrays.asList(0,1));
+    expectedAlignmentMap.put(1, Arrays.asList(3));
+    expectedAlignmentMap.put(3, Arrays.asList(3));
+    rule1 = new Rule(-1, sourceRhs, targetRhs, "", arity, alignment);
+    rule2 = new Rule(-1, sourceRhs, targetRhs, "", arity, null); // no alignment
+  }
+
+  @Test
+  public void test() {
+    // test regular rule with arity 2
+    Map<Integer, List<Integer>> alignmentMap1 = rule1.getAlignmentMap();
+    assertEquals(expectedAlignmentMap, alignmentMap1);
+    int[] nonTerminalPositions1 = rule1.getNonTerminalSourcePositions();
+    assertArrayEquals(expectedNonTerminalPositions, nonTerminalPositions1);
+    
+    // test rule with no alignment
+    Map<Integer, List<Integer>> alignmentMap2 = rule2.getAlignmentMap();
+    assertTrue(alignmentMap2.isEmpty());
+    int[] nonTerminalPositions2 = rule2.getNonTerminalSourcePositions();
+    assertArrayEquals(expectedNonTerminalPositions, nonTerminalPositions2);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9501535d/tst/joshua/system/StructuredOutputTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/StructuredOutputTest.java 
b/tst/joshua/system/StructuredOutputTest.java
new file mode 100644
index 0000000..981f9d8
--- /dev/null
+++ b/tst/joshua/system/StructuredOutputTest.java
@@ -0,0 +1,103 @@
+package joshua.system;
+
+import java.util.Arrays;
+import java.util.List;
+
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.Translation;
+import joshua.decoder.segment_file.Sentence;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.Assert;
+
+/**
+ * Integration test for the complete Joshua decoder using a toy grammar that
+ * translates a bunch of capital letters to lowercase letters. Rules in the
+ * test grammar drop and generate additional words and simulate reordering of
+ * rules, so that proper extraction of word alignments can be tested.
+ * 
+ * @author fhieber
+ */
+public class StructuredOutputTest {
+
+  private JoshuaConfiguration joshuaConfig = null;
+  private Decoder decoder = null;
+  private Translation translation = null;
+  private static final String input = "A K B1 U Z1 Z2 B2 C";
+  private static final String expectedTranslation = "a b n1 u z c1 k1 k2 k3 n1 
n2 n3 c2";
+  // "source-target" index pairs, ordered by target index
+  private static final String expectedWordAlignmentString = "0-0 2-1 6-1 3-3 
4-4 5-4 7-5 1-6 1-7 1-8 7-12";
+  // the same alignment as one list of source indexes per target token
+  private static final List<List<Integer>> expectedWordAlignment = 
Arrays.asList(
+      Arrays.asList(0), Arrays.asList(2, 6), Arrays.asList(), Arrays.asList(3),
+      Arrays.asList(4, 5), Arrays.asList(7), Arrays.asList(1),
+      Arrays.asList(1), Arrays.asList(1), Arrays.asList(), Arrays.asList(),
+      Arrays.asList(), Arrays.asList(7));
+  private static final double expectedScore = -17.0;
+
+  /** Configures a decoder over the toy grammar and the glue grammar. */
+  @Before
+  public void setUp() throws Exception {
+    joshuaConfig = new JoshuaConfiguration();
+    joshuaConfig.search_algorithm = "cky";
+    joshuaConfig.mark_oovs = false;
+    joshuaConfig.pop_limit = 100;
+    joshuaConfig.use_unique_nbest = false;
+    joshuaConfig.include_align_index = false;
+    joshuaConfig.topN = 0;
+    joshuaConfig.tms.add("thrax pt 20 resources/wa_grammar");
+    joshuaConfig.tms.add("thrax glue -1 resources/grammar.glue");
+    joshuaConfig.goal_symbol = "[GOAL]";
+    joshuaConfig.default_non_terminal = "[X]";
+    joshuaConfig.features.add("feature_function = OOVPenalty");
+    joshuaConfig.weights.add("tm_pt_0 1");
+    joshuaConfig.weights.add("tm_pt_1 1");
+    joshuaConfig.weights.add("tm_pt_2 1");
+    joshuaConfig.weights.add("tm_pt_3 1");
+    joshuaConfig.weights.add("tm_pt_4 1");
+    joshuaConfig.weights.add("tm_pt_5 1");
+    joshuaConfig.weights.add("tm_glue_0 1");
+    joshuaConfig.weights.add("OOVPenalty 2");
+    decoder = new Decoder(joshuaConfig, ""); // second argument (configFile
+                                             // is not even used by the
+                                             // constructor/initialize)
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    decoder.cleanUp();
+    decoder = null;
+    translation = null;
+  }
+
+  /** Decodes the given text as sentence 0 with the shared decoder. */
+  private Translation decode(String input) {
+    Sentence sentence = new Sentence(input, 0, joshuaConfig);
+    return decoder.decode(sentence);
+  }
+
+  @Test
+  public void test() {
+
+    // test standard output
+    joshuaConfig.use_structured_output = false;
+    joshuaConfig.outputFormat = "%s | %a ";
+    translation = decode(input);
+    Assert.assertEquals(expectedTranslation + " | "
+        + expectedWordAlignmentString, translation.toString().trim());
+
+    // test structured output
+    joshuaConfig.use_structured_output = true; // switch to structured output
+    translation = decode(input);
+    Assert
+        .assertEquals(expectedTranslation, translation.getTranslationString());
+    Assert.assertEquals(Arrays.asList(expectedTranslation.split("\\s+")),
+        translation.getTranslationTokens());
+    Assert.assertEquals(expectedScore, translation.getTranslationScore(),
+        0.00001);
+    Assert.assertEquals(expectedWordAlignment, translation.getWordAlignment());
+    // there must be exactly one alignment entry per output token
+    Assert.assertEquals(translation.getWordAlignment().size(), translation
+        .getTranslationTokens().size());
+
+  }
+
+}

[02/10] incubator-joshua git commit: Decoder's Translation class now contains more members including the possibility to store word alignment from the derivation. Allows use of Joshua decoder class in a larger code project to extract information, rather than relying on stdout.

Reply via email to