Author: tommaso
Date: Sun Nov 13 17:54:46 2016
New Revision: 1769529

URL: http://svn.apache.org/viewvc?rev=1769529&view=rev
Log:
minor perf improvements
Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
    labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
    labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java Sun Nov 13 17:54:46 2016
@@ -100,8 +100,8 @@ public class RNN {
     wxh = Nd4j.randn(hiddenLayerSize, vocabSize).mul(0.01);
     whh = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     why = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
-    bh = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    by = Nd4j.zeros(vocabSize, 1).mul(0.01);
+    bh = Nd4j.zeros(hiddenLayerSize, 1);
+    by = Nd4j.zeros(vocabSize, 1);
   }
 
   private String[] toStrings(char[] chars) {
@@ -248,7 +248,7 @@ public class RNN {
     // backward pass: compute gradients going backwards
     INDArray dhNext = Nd4j.zerosLike(hs.getRow(0));
     for (int t = inputs.length() - 1; t >= 0; t--) {
-      INDArray dy = ps.getRow(t).dup();
+      INDArray dy = ps.getRow(t);
       dy.putRow(targets.getInt(t), dy.getRow(targets.getInt(t)).sub(1)); // backprop into y
       INDArray hst = hs.getRow(t);
       dWhy.addi(dy.mmul(hst.transpose())); // derivative of hy layer
@@ -263,11 +263,11 @@ public class RNN {
     }
 
     // clip exploding gradients
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhy, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWxh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWhy, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dbh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dby, -5, 5));
 
     return loss;
   }
@@ -292,11 +292,9 @@ public class RNN {
     int sampleSize = 2 * seqLength;
     INDArray ixes = Nd4j.create(sampleSize);
 
-    INDArray h = hPrev.dup();
-
     for (int t = 0; t < sampleSize; t++) {
-      h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh)));
-      INDArray y = (why.mmul(h)).add(by);
+      hPrev = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(hPrev)).add(bh)));
+      INDArray y = (why.mmul(hPrev)).add(by);
       INDArray pm = Nd4j.getExecutioner().execAndReturn(new SoftMax(y)).ravel();
 
       List<Pair<Integer, Double>> d = new LinkedList<>();
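The two ideas behind the RNN.java changes, as a standalone sketch (not part of the patch): scaling an all-zero bias by 0.01 only allocates a second zero array without changing any value, and SetRange clamps a gradient into [-5, 5] in place, so exec(..) is enough when the returned array is not needed. The class name and sizes below are illustrative, and import paths are assumed to match the ND4J version already on this project's classpath (the same classes RNN.java imports).

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.api.ops.impl.transforms.SetRange;
    import org.nd4j.linalg.factory.Nd4j;

    public class ClipSketch {
      public static void main(String[] args) {
        // Bias init: zeros(..).mul(0.01) built a second all-zero array; zeros(..) alone is equivalent.
        INDArray bh = Nd4j.zeros(100, 1);

        // Gradient clipping: SetRange mutates its input, clamping every element into [-5, 5].
        // exec(..) runs the op in place; execAndReturn(..) only added a wrapper around the same array.
        INDArray dWxh = Nd4j.randn(100, 100).mul(3);
        Nd4j.getExecutioner().exec(new SetRange(dWxh, -5, 5));

        System.out.println(bh);
        System.out.println(dWxh);
      }
    }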
Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java Sun Nov 13 17:54:46 2016
@@ -70,9 +70,9 @@ public class StackedRNN extends RNN {
     whh = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     whh2 = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     wh2y = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
-    bh = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    bh2 = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    by = Nd4j.zeros(vocabSize, 1).mul(0.01);
+    bh = Nd4j.zeros(hiddenLayerSize, 1);
+    bh2 = Nd4j.zeros(hiddenLayerSize, 1);
+    by = Nd4j.zeros(vocabSize, 1);
   }
 
   public void learn() {
@@ -112,7 +112,7 @@ public class StackedRNN {
       INDArray targets = getSequence(p + 1);
 
       // sample from the model every now and then
-      if (n % 100 == 0 && n > 0) {
+      if (n % 1000 == 0 && n > 0) {
        String txt = sample(inputs.getInt(0));
        System.out.printf("\n---\n %s \n----\n", txt);
      }
@@ -172,69 +172,63 @@ public class StackedRNN {
   private double lossFun(INDArray inputs, INDArray targets, INDArray dWxh, INDArray dWhh, INDArray dWhh2, INDArray dWh2y, INDArray dbh, INDArray dbh2, INDArray dby) {
 
-    INDArray xs = Nd4j.zeros(inputs.length(), vocabSize);
+    INDArray xs = Nd4j.zeros(seqLength, vocabSize);
     INDArray hs = null;
     INDArray hs2 = null;
     INDArray ys = null;
     INDArray ps = null;
 
-    INDArray hs1 = hPrev.dup();
-    INDArray hs12 = hPrev2.dup();
-
     double loss = 0;
 
     // forward pass
-    for (int t = 0; t < inputs.length(); t++) {
+    for (int t = 0; t < seqLength; t++) {
       int tIndex = inputs.getScalar(t).getInt(0);
       xs.putScalar(t, tIndex, 1); // encode in 1-of-k representation
-      INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
+      INDArray hsRow = t == 0 ? hPrev : hs.getRow(t - 1);
      INDArray xst = xs.getRow(t);
      INDArray hst = Transforms.tanh((wxh.mmul(xst.transpose())).add((whh.mmul(hsRow)).add(bh))); // hidden state
      if (hs == null) {
-        hs = init(inputs.length(), hst);
+        hs = init(seqLength, hst);
      }
      hs.putRow(t, hst);
 
-      INDArray hs2Row = t == 0 ? hs12 : hs2.getRow(t - 1);
+      INDArray hs2Row = t == 0 ? hPrev2 : hs2.getRow(t - 1);
      INDArray hst2 = Transforms.tanh((whh.mmul(hst)).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
      if (hs2 == null) {
-        hs2 = init(inputs.length(), hst2);
+        hs2 = init(seqLength, hst2);
      }
      hs2.putRow(t, hst2);
 
      INDArray yst = (wh2y.mmul(hst2)).add(by); // unnormalized log probabilities for next chars
      if (ys == null) {
-        ys = init(inputs.length(), yst);
+        ys = init(seqLength, yst);
      }
      ys.putRow(t, yst);
      INDArray pst = Nd4j.getExecutioner().execAndReturn(new SoftMax(yst)); // probabilities for next chars
      if (ps == null) {
-        ps = init(inputs.length(), pst);
+        ps = init(seqLength, pst);
      }
      ps.putRow(t, pst);
-      int targetsInt = targets.getInt(t);
-      loss += -Math.log(pst.getDouble(targetsInt)); // softmax (cross-entropy loss)
+      loss += -Math.log(pst.getDouble(targets.getInt(t))); // softmax (cross-entropy loss)
    }
 
    // backward pass: compute gradients going backwards
    INDArray dhNext = Nd4j.zerosLike(hs.getRow(0));
    INDArray dh2Next = Nd4j.zerosLike(hs2.getRow(0));
-    for (int t = inputs.length() - 1; t >= 0; t--) {
-
-      INDArray dy = ps.getRow(t).dup();
-      dy.putRow(targets.getInt(t), dy.getRow(targets.getInt(t)).sub(1)); // backprop into y
+    for (int t = seqLength - 1; t >= 0; t--) {
+      INDArray dy = ps.getRow(t);
+      dy.getRow(targets.getInt(t)).subi(1); // backprop into y
 
      INDArray hs2t = hs2.getRow(t);
-      INDArray hs2tm1 = t == 0 ? hs12 : hs2.getRow(t - 1);
+      INDArray hs2tm1 = t == 0 ? hPrev2 : hs2.getRow(t - 1);
 
      dWh2y.addi(dy.mmul(hs2t.transpose()));
      dby.addi(dy);
 
      INDArray dh2 = wh2y.transpose().mmul(dy).add(dh2Next); // backprop into h2
-
      INDArray dhraw2 = (Nd4j.ones(hs2t.shape()).sub(hs2t.mul(hs2t))).mul(dh2); // backprop through tanh nonlinearity
      dbh2.addi(dhraw2);
 
      INDArray hst = hs.getRow(t);
@@ -242,29 +236,28 @@
      dWhh2.addi(dhraw2.mmul(hs2tm1.transpose()));
      dh2Next = whh2.transpose().mmul(dhraw2);
 
-      INDArray dh = whh2.transpose().mmul(dhraw2).add(dhNext); // backprop into h
+      INDArray dh = dh2Next.add(dhNext); // backprop into h
      INDArray dhraw = (Nd4j.ones(hst.shape()).sub(hst.mul(hst))).mul(dh); // backprop through tanh nonlinearity
      dbh.addi(dhraw);
-
      dWxh.addi(dhraw.mmul(xs.getRow(t)));
-      INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
+      INDArray hsRow = t == 0 ? hPrev : hs.getRow(t - 1);
      dWhh.addi(dhraw.mmul(hsRow.transpose()));
      dhNext = whh.transpose().mmul(dhraw);
    }
 
-    this.hPrev = hs.getRow(inputs.length() - 1);
-    this.hPrev2 = hs2.getRow(inputs.length() - 1);
+    this.hPrev = hs.getRow(seqLength - 1);
+    this.hPrev2 = hs2.getRow(seqLength - 1);
 
    // clip exploding gradients
    int clip = 5;
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh2, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWh2y, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh2, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWxh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh2, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWh2y, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dbh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dbh2, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dby, -clip, clip));
 
    return loss;
  }
@@ -280,8 +273,8 @@ public class StackedRNN {
    int sampleSize = seqLength * 2;
    INDArray ixes = Nd4j.create(sampleSize);
 
-    INDArray h = hPrev.dup();
-    INDArray h2 = hPrev2.dup();
+    INDArray h = hPrev;
+    INDArray h2 = hPrev2;
 
    for (int t = 0; t < sampleSize; t++) {
      h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh)));
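For the dup() removal in the backward pass, the sketch below shows the step it relies on, under the same assumption StackedRNN.java makes: getRow(..) returns a view, so an in-place subi(1) on the target row updates dy directly. With p = softmax(y), the gradient with respect to y is p with 1 subtracted at the target index. The class name and toy numbers are illustrative only; imports are assumed to match the ND4J version used by this module. The dh change is the same kind of saving: whh2.transpose().mmul(dhraw2) is already stored in dh2Next on the previous line, so dh2Next.add(dhNext) avoids recomputing that matrix product.

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;

    public class CrossEntropyGradSketch {
      public static void main(String[] args) {
        // Toy softmax output for one time step, shaped as a vocabSize x 1 column.
        INDArray dy = Nd4j.create(new double[]{0.1, 0.7, 0.2}, new int[]{3, 1});
        int target = 1; // index of the observed next character

        // Old pattern: dy = ps.getRow(t).dup(); dy.putRow(target, dy.getRow(target).sub(1));
        // New pattern: mutate the row view in place, with no copy and no extra row allocation.
        dy.getRow(target).subi(1); // target entry becomes 0.7 - 1 = -0.3

        System.out.println(dy);
      }
    }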
Modified: labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt Sun Nov 13 17:54:46 2016
@@ -137,4 +137,8 @@ We employ a character-level convolutiona
 Our character-to-character model outperforms a recently proposed baseline with a subwordlevel encoder on WMT’15 DE-EN and CSEN , and gives comparable performance on FIEN and RU-EN .
 We then demonstrate that it is possible to share a single characterlevel encoder across multiple languages by training a model on a many-to-one translation task .
 In this multilingual setting , the character-level encoder significantly outperforms the subword-level encoder on all the language pairs .
-We observe that on CS-EN , FI-EN and RU-EN , the quality of the multilingual character-level translation even surpasses the models specifically trained on that language pair alone , both in terms of BLEU score and human judgment .
\ No newline at end of file
+We observe that on CS-EN , FI-EN and RU-EN , the quality of the multilingual character-level translation even surpasses the models specifically trained on that language pair alone , both in terms of BLEU score and human judgment .
+The Teacher Forcing algorithm trains recurrent networks by supplying observed sequence values as inputs during training and using the network’s own one-step-ahead predictions to do multi-step sampling .
+We introduce the Professor Forcing algorithm , which uses adversarial domain adaptation to encourage the dynamics of the recurrent network to be the same when training the network and when sampling from the network over multiple time steps .
+We apply Professor Forcing to language modeling , vocal synthesis on raw waveforms , handwriting generation , and image generation .
+Empirically we find that Professor Forcing acts as a regularizer , improving test likelihood on character level Penn Treebank and sequential MNIST. We also find that the model qualitatively improves samples, especially when sampling for a large number of time steps. This is supported by human evaluation of sample quality. Trade-offs between Professor Forcing and Scheduled Sampling are discussed. We produce T-SNEs showing that Professor Forcing successfully makes the dynamics of the network during training and sampling more similar.
\ No newline at end of file

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@labs.apache.org
For additional commands, e-mail: commits-h...@labs.apache.org