kellens: Use Guava's memoize for expensive calls, removed unneeded members fhieber: Important bugfix for obtaining word alignments from packedRules in multi-threading environment
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/cabb52ca Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/cabb52ca Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/cabb52ca Branch: refs/heads/master Commit: cabb52cabd5a81088b21b9e01a4668ebb2a85ffa Parents: 244e693 Author: Kellen Sunderland <[email protected]> Authored: Wed Oct 14 17:13:18 2015 +0200 Committer: Kellen Sunderland <[email protected]> Committed: Thu Mar 31 10:44:43 2016 +0200 ---------------------------------------------------------------------- resources/wa_grammar.packed/config | 1 + resources/wa_grammar.packed/encoding | Bin 0 -> 154 bytes .../wa_grammar.packed/slice_00000.alignments | Bin 0 -> 45 bytes .../wa_grammar.packed/slice_00000.features | Bin 0 -> 47 bytes resources/wa_grammar.packed/slice_00000.source | Bin 0 -> 204 bytes resources/wa_grammar.packed/slice_00000.target | Bin 0 -> 128 bytes .../wa_grammar.packed/slice_00000.target.lookup | Bin 0 -> 32 bytes resources/wa_grammar.packed/vocabulary | Bin 0 -> 238 bytes src/joshua/decoder/ff/tm/PhraseRule.java | 21 ++-- src/joshua/decoder/ff/tm/Rule.java | 42 ++++--- .../decoder/ff/tm/packed/PackedGrammar.java | 121 +++++++++++++------ .../system/MultithreadedTranslationTests.java | 113 +++++++++++++++++ 12 files changed, 240 insertions(+), 58 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/resources/wa_grammar.packed/config ---------------------------------------------------------------------- diff --git a/resources/wa_grammar.packed/config b/resources/wa_grammar.packed/config new file mode 100644 index 0000000..ebd1bf1 --- /dev/null +++ b/resources/wa_grammar.packed/config @@ -0,0 +1 @@ +max-source-len = 6 http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/resources/wa_grammar.packed/encoding ---------------------------------------------------------------------- diff --git a/resources/wa_grammar.packed/encoding b/resources/wa_grammar.packed/encoding new file mode 100644 index 0000000..630f69f Binary files /dev/null and b/resources/wa_grammar.packed/encoding differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/resources/wa_grammar.packed/slice_00000.alignments ---------------------------------------------------------------------- diff --git a/resources/wa_grammar.packed/slice_00000.alignments b/resources/wa_grammar.packed/slice_00000.alignments new file mode 100644 index 0000000..f1425eb Binary files /dev/null and b/resources/wa_grammar.packed/slice_00000.alignments differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/resources/wa_grammar.packed/slice_00000.features ---------------------------------------------------------------------- diff --git a/resources/wa_grammar.packed/slice_00000.features b/resources/wa_grammar.packed/slice_00000.features new file mode 100644 index 0000000..5a4c774 Binary files /dev/null and b/resources/wa_grammar.packed/slice_00000.features differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/resources/wa_grammar.packed/slice_00000.source ---------------------------------------------------------------------- diff --git a/resources/wa_grammar.packed/slice_00000.source b/resources/wa_grammar.packed/slice_00000.source new file mode 100644 index 0000000..4607b89 Binary files /dev/null and b/resources/wa_grammar.packed/slice_00000.source differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/resources/wa_grammar.packed/slice_00000.target ---------------------------------------------------------------------- diff --git a/resources/wa_grammar.packed/slice_00000.target b/resources/wa_grammar.packed/slice_00000.target new file mode 100644 index 0000000..fe11a38 Binary files /dev/null and b/resources/wa_grammar.packed/slice_00000.target differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/resources/wa_grammar.packed/slice_00000.target.lookup ---------------------------------------------------------------------- diff --git a/resources/wa_grammar.packed/slice_00000.target.lookup b/resources/wa_grammar.packed/slice_00000.target.lookup new file mode 100644 index 0000000..7d82179 Binary files /dev/null and b/resources/wa_grammar.packed/slice_00000.target.lookup differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/resources/wa_grammar.packed/vocabulary ---------------------------------------------------------------------- diff --git a/resources/wa_grammar.packed/vocabulary b/resources/wa_grammar.packed/vocabulary new file mode 100644 index 0000000..637651e Binary files /dev/null and b/resources/wa_grammar.packed/vocabulary differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/src/joshua/decoder/ff/tm/PhraseRule.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/ff/tm/PhraseRule.java b/src/joshua/decoder/ff/tm/PhraseRule.java index e42a7ee..c178b31 100644 --- a/src/joshua/decoder/ff/tm/PhraseRule.java +++ b/src/joshua/decoder/ff/tm/PhraseRule.java @@ -1,5 +1,8 @@ package joshua.decoder.ff.tm; +import com.google.common.base.Supplier; +import com.google.common.base.Suppliers; + /*** * A class for reading in rules from a Moses phrase table. Most of the conversion work is done * in {@link joshua.decoder.ff.tm.format.PhraseFormatReader}. This includes prepending every @@ -19,11 +22,20 @@ package joshua.decoder.ff.tm; public class PhraseRule extends Rule { private String mosesFeatureString = null; + private Supplier<byte[]> alignmentSupplier; public PhraseRule(int lhs, int[] french, int[] english, String sparse_features, int arity, String alignment) { super(lhs, french, english, null, arity, alignment); mosesFeatureString = sparse_features; + this.alignmentSupplier = Suppliers.memoize(() ->{ + String[] tokens = getAlignmentString().split("[-\\s]+"); + byte[] alignmentArray = new byte[tokens.length + 2]; + alignmentArray[0] = alignmentArray[1] = 0; + for (int i = 0; i < tokens.length; i++) + alignmentArray[i + 2] = (byte) (Short.parseShort(tokens[i]) + 1); + return alignmentArray; + }); } /** @@ -49,13 +61,6 @@ public class PhraseRule extends Rule { */ @Override public byte[] getAlignment() { - if (alignment == null) { - String[] tokens = getAlignmentString().split("[-\\s]+"); - alignment = new byte[tokens.length + 2]; - alignment[0] = alignment[1] = 0; - for (int i = 0; i < tokens.length; i++) - alignment[i + 2] = (byte) (Short.parseShort(tokens[i]) + 1); - } - return alignment; + return this.alignmentSupplier.get(); } } http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/src/joshua/decoder/ff/tm/Rule.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/ff/tm/Rule.java b/src/joshua/decoder/ff/tm/Rule.java index 90a44d5..3d715ea 100644 --- a/src/joshua/decoder/ff/tm/Rule.java +++ b/src/joshua/decoder/ff/tm/Rule.java @@ -8,6 +8,9 @@ import java.util.List; import java.util.Map; import java.util.regex.Pattern; +import com.google.common.base.Supplier; +import com.google.common.base.Suppliers; + import joshua.corpus.Vocabulary; import joshua.decoder.Decoder; import joshua.decoder.ff.FeatureFunction; @@ -43,6 +46,8 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> { protected FeatureVector features = null; protected String sparseFeatureString; + private final Supplier<byte[]> alignmentSupplier; + /* * a feature function will be fired for this rule only if the owner of the rule matches the owner * of the feature function @@ -63,7 +68,6 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> { // The alignment string, e.g., 0-0 0-1 1-1 2-1 private String alignmentString; - protected byte[] alignment = null; /** * Constructs a new rule using the provided parameters. The owner and rule id for this rule are @@ -86,16 +90,12 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> { this.arity = arity; this.owner = owner; this.english = targetRhs; + alignmentSupplier = initializeAlignmentSupplier(); } // Sparse feature version public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity) { - this.lhs = lhs; - this.pFrench = sourceRhs; - this.sparseFeatureString = sparseFeatures; - this.arity = arity; - this.owner = -1; - this.english = targetRhs; + this(lhs, sourceRhs, targetRhs, sparseFeatures, arity, -1); } public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, String alignment) { @@ -105,6 +105,22 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> { public Rule() { this.lhs = -1; + alignmentSupplier = initializeAlignmentSupplier(); + } + + private Supplier<byte[]> initializeAlignmentSupplier(){ + Supplier<byte[]> result = Suppliers.memoize(() ->{ + byte[] alignment = null; + String alignmentString = getAlignmentString(); + if (alignmentString != null) { + String[] tokens = alignmentString.split("[-\\s]+"); + alignment = new byte[tokens.length]; + for (int i = 0; i < tokens.length; i++) + alignment[i] = (byte) Short.parseShort(tokens[i]); + } + return alignment; + }); + return result; } // =============================================================== @@ -351,23 +367,17 @@ public class Rule implements Comparator<Rule>, Comparable<Rule> { public String getFeatureString() { return sparseFeatureString; } - + /** * Returns an alignment as a sequence of integers. The integers at positions i and i+1 are paired, * with position i indexing the source and i+1 the target. */ public byte[] getAlignment() { - if (alignment == null && getAlignmentString() != null) { - String[] tokens = getAlignmentString().split("[-\\s]+"); - alignment = new byte[tokens.length]; - for (int i = 0; i < tokens.length; i++) - alignment[i] = (byte) Short.parseShort(tokens[i]); - } - return alignment; + return this.alignmentSupplier.get(); } public String getAlignmentString() { - return alignmentString; + return this.alignmentString; } /** http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/src/joshua/decoder/ff/tm/packed/PackedGrammar.java ---------------------------------------------------------------------- diff --git a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java index dc72a4b..2251f5a 100644 --- a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java +++ b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java @@ -79,6 +79,8 @@ import joshua.util.encoding.EncoderConfiguration; import joshua.util.encoding.FloatEncoder; import joshua.util.io.LineReader; +import com.google.common.base.Supplier; +import com.google.common.base.Suppliers; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; @@ -790,32 +792,63 @@ public class PackedGrammar extends AbstractGrammar { * */ public final class PackedPhrasePair extends PackedRule { + + private final Supplier<int[]> englishSupplier; + private final Supplier<byte[]> alignmentSupplier; + public PackedPhrasePair(int address) { super(address); + englishSupplier = initializeEnglishSupplier(); + alignmentSupplier = initializeAlignmentSupplier(); } @Override public int getArity() { return PackedTrie.this.getArity() + 1; } - + /** - * Take the English phrase of the underlying rule and prepend an [X]. - * - * @return + * Initialize a number of suppliers which get evaluated when their respective getters + * are called. + * Inner lambda functions are guaranteed to only be called once, because of this underlying + * structures are accessed in a threadsafe way. + * Guava's implementation makes sure only one read of a volatile variable occurs per get. + * This means this implementation should be as thread-safe and performant as possible. */ - @Override - public int[] getEnglish() { - if (tgt == null) { + + private Supplier<int[]> initializeEnglishSupplier(){ + Supplier<int[]> result = Suppliers.memoize(() ->{ int[] phrase = getTarget(source[address + 1]); - tgt = new int[phrase.length + 1]; + int[] tgt = new int[phrase.length + 1]; tgt[0] = -1; for (int i = 0; i < phrase.length; i++) tgt[i+1] = phrase[i]; - } - return tgt; + return tgt; + }); + return result; } + private Supplier<byte[]> initializeAlignmentSupplier(){ + Supplier<byte[]> result = Suppliers.memoize(() ->{ + byte[] raw_alignment = getAlignmentArray(source[address + 2]); + byte[] points = new byte[raw_alignment.length + 2]; + points[0] = points[1] = 0; + for (int i = 0; i < raw_alignment.length; i++) + points[i + 2] = (byte) (raw_alignment[i] + 1); + return points; + }); + return result; + } + + /** + * Take the English phrase of the underlying rule and prepend an [X]. + * + * @return + */ + @Override + public int[] getEnglish() { + return this.englishSupplier.get(); + } /** * Take the French phrase of the underlying rule and prepend an [X]. @@ -838,27 +871,51 @@ public class PackedGrammar extends AbstractGrammar { */ @Override public byte[] getAlignment() { - // alignments is the underlying raw alignment data - if (alignments != null) { - byte[] a = getAlignmentArray(source[address + 2]); - byte[] points = new byte[a.length + 2]; - points[0] = points[1] = 0; - for (int i = 0; i < a.length; i++) - points[i + 2] = (byte) (a[i] + 1); - return points; + // if no alignments in grammar do not fail + if (alignments == null) { + return null; } - return null; + + return this.alignmentSupplier.get(); } } public class PackedRule extends Rule { protected final int address; - - protected int[] tgt = null; - private FeatureVector features = null; + private final Supplier<int[]> englishSupplier; + private final Supplier<FeatureVector> featureVectorSupplier; + private final Supplier<byte[]> alignmentsSupplier; public PackedRule(int address) { this.address = address; + this.englishSupplier = intializeEnglishSupplier(); + this.featureVectorSupplier = initializeFeatureVectorSupplier(); + this.alignmentsSupplier = initializeAlignmentsSupplier(); + } + + private Supplier<int[]> intializeEnglishSupplier(){ + Supplier<int[]> result = Suppliers.memoize(() ->{ + return getTarget(source[address + 1]); + }); + return result; + } + + private Supplier<FeatureVector> initializeFeatureVectorSupplier(){ + Supplier<FeatureVector> result = Suppliers.memoize(() ->{ + return new FeatureVector(getFeatures(source[address + 2]), "tm_" + Vocabulary.word(owner) + "_"); + }); + return result; + } + + private Supplier<byte[]> initializeAlignmentsSupplier(){ + Supplier<byte[]> result = Suppliers.memoize(()->{ + // if no alignments in grammar do not fail + if (alignments == null){ + return null; + } + return getAlignmentArray(source[address + 2]); + }); + return result; } @Override @@ -894,10 +951,7 @@ public class PackedGrammar extends AbstractGrammar { @Override public int[] getEnglish() { - if (tgt == null) { - tgt = getTarget(source[address + 1]); - } - return tgt; + return this.englishSupplier.get(); } @Override @@ -911,18 +965,17 @@ public class PackedGrammar extends AbstractGrammar { @Override public FeatureVector getFeatureVector() { - if (features == null) { - features = new FeatureVector(getFeatures(source[address + 2]), "tm_" + Vocabulary.word(owner) + "_"); - } - - return features; + return this.featureVectorSupplier.get(); } @Override public byte[] getAlignment() { - if (alignments != null) - return getAlignmentArray(source[address + 2]); - return null; + return this.alignmentsSupplier.get(); + } + + @Override + public String getAlignmentString() { + throw new RuntimeException("AlignmentString not implemented for PackedRule!"); } @Override http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cabb52ca/tst/joshua/system/MultithreadedTranslationTests.java ---------------------------------------------------------------------- diff --git a/tst/joshua/system/MultithreadedTranslationTests.java b/tst/joshua/system/MultithreadedTranslationTests.java new file mode 100644 index 0000000..4ff549c --- /dev/null +++ b/tst/joshua/system/MultithreadedTranslationTests.java @@ -0,0 +1,113 @@ +package joshua.system; + +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.nio.charset.Charset; +import java.util.ArrayList; + +import joshua.corpus.Vocabulary; +import joshua.decoder.Decoder; +import joshua.decoder.JoshuaConfiguration; +import joshua.decoder.Translation; +import joshua.decoder.Translations; +import joshua.decoder.io.TranslationRequest; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Integration test for multithreaded Joshua decoder tests. Grammar used is a + * toy packed grammar. + * + * @author kellens + */ +public class MultithreadedTranslationTests { + + private JoshuaConfiguration joshuaConfig = null; + private Decoder decoder = null; + private static final String INPUT = "A K B1 U Z1 Z2 B2 C"; + + @Before + public void setUp() throws Exception { + Vocabulary.clear(); + joshuaConfig = new JoshuaConfiguration(); + joshuaConfig.search_algorithm = "cky"; + joshuaConfig.mark_oovs = false; + joshuaConfig.pop_limit = 100; + joshuaConfig.use_unique_nbest = false; + joshuaConfig.include_align_index = false; + joshuaConfig.topN = 0; + joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path resources/wa_grammar.packed"); + joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue"); + joshuaConfig.goal_symbol = "[GOAL]"; + joshuaConfig.default_non_terminal = "[X]"; + joshuaConfig.features.add("feature_function = OOVPenalty"); + joshuaConfig.weights.add("tm_pt_0 1"); + joshuaConfig.weights.add("tm_pt_1 1"); + joshuaConfig.weights.add("tm_pt_2 1"); + joshuaConfig.weights.add("tm_pt_3 1"); + joshuaConfig.weights.add("tm_pt_4 1"); + joshuaConfig.weights.add("tm_pt_5 1"); + joshuaConfig.weights.add("tm_glue_0 1"); + joshuaConfig.weights.add("OOVPenalty 2"); + joshuaConfig.num_parallel_decoders = 500; // This will enable 500 parallel + // decoders to run at once. + // Useful to help flush out + // concurrency errors in + // underlying + // data-structures. + this.decoder = new Decoder(joshuaConfig, ""); // Second argument + // (configFile) + // is not even used by the + // constructor/initialize. + } + + @After + public void tearDown() throws Exception { + Vocabulary.clear(); + this.decoder.cleanUp(); + this.decoder = null; + } + + + + // This test was created specifically to reproduce a multithreaded issue + // related to mapped byte array access in the PackedGrammer getAlignmentArray + // function. + + // We'll test the decoding engine using N = 10,000 identical inputs. This + // should be sufficient to induce concurrent data access for many shared + // data structures. + + @Test + public void givenPackedGrammar_whenNTranslationsCalledConcurrently_thenReturnNResults() { + // GIVEN + + int inputLines = 10000; + joshuaConfig.construct_structured_output = true; // Enabled alignments. + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < inputLines; i++) { + sb.append(INPUT + "\n"); + } + + // Append a large string together to simulate N requests to the decoding + // engine. + TranslationRequest req = new TranslationRequest(new ByteArrayInputStream(sb.toString() + .getBytes(Charset.forName("UTF-8"))), joshuaConfig); + + // WHEN + // Translate all spans in parallel. + Translations translations = this.decoder.decodeAll(req); + ArrayList<Translation> translationResults = new ArrayList<Translation>(); + + Translation t; + while ((t = translations.next()) != null) { + translationResults.add(t); + } + + // THEN + assertTrue(translationResults.size() == inputLines); + } +}
