Author: tommaso
Date: Sun Nov 13 17:54:46 2016
New Revision: 1769529

URL: http://svn.apache.org/viewvc?rev=1769529&view=rev
Log:
minor perf improvements: drop redundant scaling of zero-initialized biases, avoid unnecessary dup() copies, use exec instead of execAndReturn for gradient clipping, and sample less frequently while training

Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
    labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
    labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
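
The bias changes in both RNN.java and StackedRNN.java below rest on the fact that scaling an all-zero matrix is a no-op, so only a wasted multiplication is removed. A standalone sketch of that point (not part of the commit; the class name and size are made up for illustration):

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.factory.Nd4j;

    // Illustrative only: multiplying an all-zero bias by 0.01 leaves it all zero,
    // so Nd4j.zeros(n, 1) on its own gives the same result with one op fewer.
    public class BiasInitSketch {
      public static void main(String[] args) {
        int hiddenLayerSize = 100;
        INDArray scaled = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01); // old form
        INDArray plain = Nd4j.zeros(hiddenLayerSize, 1);            // new form
        System.out.println(scaled.equals(plain)); // true: identical all-zero biases
      }
    }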

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/RNN.java Sun Nov 13 17:54:46 2016
@@ -100,8 +100,8 @@ public class RNN {
     wxh = Nd4j.randn(hiddenLayerSize, vocabSize).mul(0.01);
     whh = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     why = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
-    bh = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    by = Nd4j.zeros(vocabSize, 1).mul(0.01);
+    bh = Nd4j.zeros(hiddenLayerSize, 1);
+    by = Nd4j.zeros(vocabSize, 1);
   }
 
   private String[] toStrings(char[] chars) {
@@ -248,7 +248,7 @@ public class RNN {
     // backward pass: compute gradients going backwards
     INDArray dhNext = Nd4j.zerosLike(hs.getRow(0));
     for (int t = inputs.length() - 1; t >= 0; t--) {
-      INDArray dy = ps.getRow(t).dup();
+      INDArray dy = ps.getRow(t);
       dy.putRow(targets.getInt(t), dy.getRow(targets.getInt(t)).sub(1)); // backprop into y
       INDArray hst = hs.getRow(t);
       dWhy.addi(dy.mmul(hst.transpose())); // derivative of hy layer
@@ -263,11 +263,11 @@ public class RNN {
     }
 
     // clip exploding gradients
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhy, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -5, 5));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWxh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dWhy, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dbh, -5, 5));
+    Nd4j.getExecutioner().exec(new SetRange(dby, -5, 5));
 
     return loss;
   }
@@ -292,11 +292,9 @@ public class RNN {
     int sampleSize = 2 * seqLength;
     INDArray ixes = Nd4j.create(sampleSize);
 
-    INDArray h = hPrev.dup();
-
     for (int t = 0; t < sampleSize; t++) {
-      h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh)));
-      INDArray y = (why.mmul(h)).add(by);
+      hPrev = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(hPrev)).add(bh)));
+      INDArray y = (why.mmul(hPrev)).add(by);
       INDArray pm = Nd4j.getExecutioner().execAndReturn(new SoftMax(y)).ravel();
 
       List<Pair<Integer, Double>> d = new LinkedList<>();
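
For reference, SetRange clamps its target array into [min, max] in place, so the value returned by execAndReturn was never used here and plain exec suffices. A minimal sketch of the clipping step on its own (not part of the commit; the class name and sample values are invented, and the import path assumes the ND4J version in use at the time):

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.api.ops.impl.transforms.SetRange;
    import org.nd4j.linalg.factory.Nd4j;

    // Illustrative only: clip a gradient array to [-5, 5] in place.
    public class ClipSketch {
      public static void main(String[] args) {
        INDArray dWxh = Nd4j.create(new double[]{-12.0, 0.5, 7.3});
        Nd4j.getExecutioner().exec(new SetRange(dWxh, -5, 5)); // mutates dWxh
        System.out.println(dWxh); // values outside [-5, 5] are clamped to the bounds
      }
    }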

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/StackedRNN.java Sun Nov 13 17:54:46 2016
@@ -70,9 +70,9 @@ public class StackedRNN extends RNN {
     whh = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     whh2 = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
     wh2y = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
-    bh = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    bh2 = Nd4j.zeros(hiddenLayerSize, 1).mul(0.01);
-    by = Nd4j.zeros(vocabSize, 1).mul(0.01);
+    bh = Nd4j.zeros(hiddenLayerSize, 1);
+    bh2 = Nd4j.zeros(hiddenLayerSize, 1);
+    by = Nd4j.zeros(vocabSize, 1);
   }
 
   public void learn() {
@@ -112,7 +112,7 @@ public class StackedRNN extends RNN {
       INDArray targets = getSequence(p + 1);
 
       // sample from the model every now and then
-      if (n % 100 == 0 && n > 0) {
+      if (n % 1000 == 0 && n > 0) {
         String txt = sample(inputs.getInt(0));
         System.out.printf("\n---\n %s \n----\n", txt);
       }
@@ -172,69 +172,63 @@ public class StackedRNN extends RNN {
   private double lossFun(INDArray inputs, INDArray targets, INDArray dWxh, INDArray dWhh, INDArray dWhh2, INDArray dWh2y,
                          INDArray dbh, INDArray dbh2, INDArray dby) {
 
-    INDArray xs = Nd4j.zeros(inputs.length(), vocabSize);
+    INDArray xs = Nd4j.zeros(seqLength, vocabSize);
     INDArray hs = null;
     INDArray hs2 = null;
     INDArray ys = null;
     INDArray ps = null;
 
-    INDArray hs1 = hPrev.dup();
-    INDArray hs12 = hPrev2.dup();
-
     double loss = 0;
 
     // forward pass
-    for (int t = 0; t < inputs.length(); t++) {
+    for (int t = 0; t < seqLength; t++) {
       int tIndex = inputs.getScalar(t).getInt(0);
       xs.putScalar(t, tIndex, 1); // encode in 1-of-k representation
 
-      INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
+      INDArray hsRow = t == 0 ? hPrev : hs.getRow(t - 1);
       INDArray xst = xs.getRow(t);
       INDArray hst = Transforms.tanh((wxh.mmul(xst.transpose())).add((whh.mmul(hsRow)).add(bh))); // hidden state
       if (hs == null) {
-        hs = init(inputs.length(), hst);
+        hs = init(seqLength, hst);
       }
       hs.putRow(t, hst);
 
-      INDArray hs2Row = t == 0 ? hs12 : hs2.getRow(t - 1);
+      INDArray hs2Row = t == 0 ? hPrev2 : hs2.getRow(t - 1);
       INDArray hst2 = Transforms.tanh((whh.mmul(hst)).add((whh2.mmul(hs2Row)).add(bh2))); // hidden state 2
       if (hs2 == null) {
-        hs2 = init(inputs.length(), hst2);
+        hs2 = init(seqLength, hst2);
       }
       hs2.putRow(t, hst2);
 
       INDArray yst = (wh2y.mmul(hst2)).add(by); // unnormalized log probabilities for next chars
       if (ys == null) {
-        ys = init(inputs.length(), yst);
+        ys = init(seqLength, yst);
       }
       ys.putRow(t, yst);
 
       INDArray pst = Nd4j.getExecutioner().execAndReturn(new SoftMax(yst)); // probabilities for next chars
       if (ps == null) {
-        ps = init(inputs.length(), pst);
+        ps = init(seqLength, pst);
       }
       ps.putRow(t, pst);
 
-      int targetsInt = targets.getInt(t);
-      loss += -Math.log(pst.getDouble(targetsInt)); // softmax (cross-entropy loss)
+      loss += -Math.log(pst.getDouble(targets.getInt(t))); // softmax (cross-entropy loss)
     }
 
     // backward pass: compute gradients going backwards
     INDArray dhNext = Nd4j.zerosLike(hs.getRow(0));
     INDArray dh2Next = Nd4j.zerosLike(hs2.getRow(0));
-    for (int t = inputs.length() - 1; t >= 0; t--) {
-
-      INDArray dy = ps.getRow(t).dup();
-      dy.putRow(targets.getInt(t), dy.getRow(targets.getInt(t)).sub(1)); // backprop into y
+    for (int t = seqLength - 1; t >= 0; t--) {
+      INDArray dy = ps.getRow(t);
+      dy.getRow(targets.getInt(t)).subi(1); // backprop into y
 
       INDArray hs2t = hs2.getRow(t);
-      INDArray hs2tm1 = t == 0 ? hs12 : hs2.getRow(t - 1);
+      INDArray hs2tm1 = t == 0 ? hPrev2 : hs2.getRow(t - 1);
 
       dWh2y.addi(dy.mmul(hs2t.transpose()));
       dby.addi(dy);
 
       INDArray dh2 = wh2y.transpose().mmul(dy).add(dh2Next); // backprop into h2
-
       INDArray dhraw2 = (Nd4j.ones(hs2t.shape()).sub(hs2t.mul(hs2t))).mul(dh2); //  backprop through tanh nonlinearity
       dbh2.addi(dhraw2);
       INDArray hst = hs.getRow(t);
@@ -242,29 +236,28 @@ public class StackedRNN extends RNN {
       dWhh2.addi(dhraw2.mmul(hs2tm1.transpose()));
       dh2Next = whh2.transpose().mmul(dhraw2);
 
-      INDArray dh = whh2.transpose().mmul(dhraw2).add(dhNext); // backprop into h
+      INDArray dh = dh2Next.add(dhNext); // backprop into h
       INDArray dhraw = (Nd4j.ones(hst.shape()).sub(hst.mul(hst))).mul(dh); // backprop through tanh nonlinearity
       dbh.addi(dhraw);
-
       dWxh.addi(dhraw.mmul(xs.getRow(t)));
-      INDArray hsRow = t == 0 ? hs1 : hs.getRow(t - 1);
+      INDArray hsRow = t == 0 ? hPrev : hs.getRow(t - 1);
       dWhh.addi(dhraw.mmul(hsRow.transpose()));
       dhNext = whh.transpose().mmul(dhraw);
 
     }
 
-    this.hPrev = hs.getRow(inputs.length() - 1);
-    this.hPrev2 = hs2.getRow(inputs.length() - 1);
+    this.hPrev = hs.getRow(seqLength - 1);
+    this.hPrev2 = hs2.getRow(seqLength - 1);
 
     // clip exploding gradients
     int clip = 5;
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWxh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWhh2, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dWh2y, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dbh2, -clip, clip));
-    Nd4j.getExecutioner().execAndReturn(new SetRange(dby, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWxh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWhh2, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dWh2y, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dbh, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dbh2, -clip, clip));
+    Nd4j.getExecutioner().exec(new SetRange(dby, -clip, clip));
 
     return loss;
   }
@@ -280,8 +273,8 @@ public class StackedRNN extends RNN {
     int sampleSize = seqLength * 2;
     INDArray ixes = Nd4j.create(sampleSize);
 
-    INDArray h = hPrev.dup();
-    INDArray h2 = hPrev2.dup();
+    INDArray h = hPrev;
+    INDArray h2 = hPrev2;
 
     for (int t = 0; t < sampleSize; t++) {
       h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh)));
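
Both sample() methods follow the same loop: update the hidden state through tanh, compute the output logits, turn them into a distribution with SoftMax, and feed the chosen index back in as a one-hot input. A condensed single-layer sketch of that loop (not part of the commit; the class name and sizes are invented, the greedy arg-max pick stands in for sampling from the distribution, and the import paths assume the ND4J version in use at the time):

    import org.nd4j.linalg.api.ndarray.INDArray;
    import org.nd4j.linalg.api.ops.impl.transforms.SoftMax;
    import org.nd4j.linalg.factory.Nd4j;
    import org.nd4j.linalg.ops.transforms.Transforms;

    public class SampleSketch {
      public static void main(String[] args) {
        int vocabSize = 5, hiddenLayerSize = 8;
        INDArray wxh = Nd4j.randn(hiddenLayerSize, vocabSize).mul(0.01);
        INDArray whh = Nd4j.randn(hiddenLayerSize, hiddenLayerSize).mul(0.01);
        INDArray why = Nd4j.randn(vocabSize, hiddenLayerSize).mul(0.01);
        INDArray bh = Nd4j.zeros(hiddenLayerSize, 1);
        INDArray by = Nd4j.zeros(vocabSize, 1);

        INDArray x = Nd4j.zeros(vocabSize, 1);
        x.putScalar(0, 1.0); // one-hot encoding of the seed index
        INDArray h = Nd4j.zeros(hiddenLayerSize, 1);

        for (int t = 0; t < 3; t++) {
          h = Transforms.tanh((wxh.mmul(x)).add((whh.mmul(h)).add(bh))); // hidden state
          INDArray y = (why.mmul(h)).add(by);                            // logits
          INDArray p = Nd4j.getExecutioner().execAndReturn(new SoftMax(y)).ravel();
          int ix = 0; // greedy pick instead of sampling, purely for illustration
          for (int i = 1; i < vocabSize; i++) {
            if (p.getDouble(i) > p.getDouble(ix)) ix = i;
          }
          x = Nd4j.zeros(vocabSize, 1);
          x.putScalar(ix, 1.0); // feed the chosen index back in as the next input
          System.out.println("t=" + t + " -> index " + ix);
        }
      }
    }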

Modified: labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt?rev=1769529&r1=1769528&r2=1769529&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt Sun Nov 13 17:54:46 2016
@@ -137,4 +137,8 @@ We employ a character-level convolutiona
 Our character-to-character model outperforms a recently proposed baseline with a subwordlevel encoder on WMT’15 DE-EN and CSEN , and gives comparable performance on FIEN and RU-EN .
 We then demonstrate that it is possible to share a single characterlevel encoder across multiple languages by training a model on a many-to-one translation task .
 In this multilingual setting , the character-level encoder significantly outperforms the subword-level encoder on all the language pairs .
-We observe that on CS-EN , FI-EN and RU-EN , the quality of the multilingual character-level translation even surpasses the models specifically trained on that language pair alone , both in terms of BLEU score and human judgment .
\ No newline at end of file
+We observe that on CS-EN , FI-EN and RU-EN , the quality of the multilingual character-level translation even surpasses the models specifically trained on that language pair alone , both in terms of BLEU score and human judgment .
+The Teacher Forcing algorithm trains recurrent networks by supplying observed sequence values as inputs during training and using the network’s own one-step- ahead predictions to do multi-step sampling .
+We introduce the Professor Forcing algorithm , which uses adversarial domain adaptation to encourage the dynamics of the recurrent network to be the same when training the network and when sampling from the network over multiple time steps .
+We apply Professor Forcing to language modeling , vocal synthesis on raw waveforms , handwriting generation , and image generation .
+Empirically we find that Professor Forcing acts as a regularizer , improving test likelihood on character level Penn Treebank and sequential MNIST. We also find that the model qualitatively improves samples, especially when sam- pling for a large number of time steps. This is supported by human evaluation of sample quality. Trade-offs between Professor Forcing and Scheduled Sampling are discussed. We produce T-SNEs showing that Professor Forcing successfully makes the dynamics of the network during training and sampling more similar.
\ No newline at end of file


