Author: tommaso
Date: Sun Oct 9 06:32:50 2016
New Revision: 1763945

URL: http://svn.apache.org/viewvc?rev=1763945&view=rev
Log: rnn losing memory of previous hidden state - fixed
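Why the one-line change matters: before this commit, every forward/backward pass started from a zeroed hidden state, so the network forgot whatever it had accumulated over the previous sequence chunk. The sketch below illustrates the intended behaviour only; it assumes ND4J on the classpath, reuses the names hPrev, hs and inputs from the diff, and its class, forward() and step() methods are simplified stand-ins for the real CharRNN/WordRNN code, not part of this commit.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

public class HiddenStateCarrySketch {

  private final int hiddenLayerSize = 8;
  // zeroed once, before training starts; afterwards it carries memory between chunks
  private INDArray hPrev = Nd4j.zeros(1, hiddenLayerSize);

  // one forward pass over a chunk of inputs (one time step per row)
  void forward(INDArray inputs) {
    int steps = inputs.rows();
    INDArray hs = Nd4j.zeros(steps, hiddenLayerSize);
    INDArray h = hPrev;
    for (int t = 0; t < steps; t++) {
      h = step(inputs.getRow(t), h); // h_t = f(x_t, h_{t-1})
      hs.putRow(t, h);
    }
    // the fix: remember the last hidden state so the next chunk continues from it
    // instead of implicitly restarting from zeros on every call
    this.hPrev = hs.getRow(steps - 1);
  }

  private INDArray step(INDArray x, INDArray h) {
    // hypothetical placeholder for the real Wxh/Whh/tanh update inside the loss function
    return Transforms.tanh(x.add(h));
  }

  public static void main(String[] args) {
    HiddenStateCarrySketch rnn = new HiddenStateCarrySketch();
    rnn.forward(Nd4j.rand(5, 8)); // first chunk
    rnn.forward(Nd4j.rand(5, 8)); // second chunk now starts from the remembered state
  }
}

Whether hPrev should also be reset at certain points (for example when the input pointer wraps around the training text) is a separate policy left to the training loop and is not shown here.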
Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/CharRNN.java
    labs/yay/trunk/core/src/main/java/org/apache/yay/WordRNN.java
    labs/yay/trunk/core/src/test/java/org/apache/yay/CharRNNCrossValidationTest.java
    labs/yay/trunk/core/src/test/java/org/apache/yay/WordRNNCrossValidationTest.java
    labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/CharRNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/CharRNN.java?rev=1763945&r1=1763944&r2=1763945&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/CharRNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/CharRNN.java Sun Oct 9 06:32:50 2016
@@ -224,6 +224,8 @@ public class CharRNN {
       loss += -Transforms.log(ps.getRow(t).getRow(targets.getInt(t)), true).sumNumber().doubleValue(); // softmax (cross-entropy loss)
     }
 
+    this.hPrev = hs.getRow(inputs.length() - 1);
+
     // backward pass: compute gradients going backwards
     INDArray dhNext = Nd4j.zerosLike(hs.getRow(0));
     for (int t = inputs.length() - 1; t >= 0; t--) {

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/WordRNN.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/WordRNN.java?rev=1763945&r1=1763944&r2=1763945&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/WordRNN.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/WordRNN.java Sun Oct 9 06:32:50 2016
@@ -142,7 +142,7 @@ public class WordRNN {
         System.out.println("loss is NaN (over/underflow occured, try adjusting hyperparameters)");
         break;
       }
-      if (n % 1000 == 0) {
+      if (n % 100 == 0) {
         System.out.printf("iter %d, loss: %f\n", n, smoothLoss); // print progress
       }
 
@@ -223,6 +223,8 @@ public class WordRNN {
       loss += -Transforms.log(ps.getRow(t).getRow(targets.getInt(t)), true).sumNumber().doubleValue(); // softmax (cross-entropy loss)
     }
 
+    this.hPrev = hs.getRow(inputs.length() - 1);
+
     // backward pass: compute gradients going backwards
     INDArray dhNext = Nd4j.zerosLike(hs.getRow(0));
     for (int t = inputs.length() - 1; t >= 0; t--) {

Modified: labs/yay/trunk/core/src/test/java/org/apache/yay/CharRNNCrossValidationTest.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/java/org/apache/yay/CharRNNCrossValidationTest.java?rev=1763945&r1=1763944&r2=1763945&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/java/org/apache/yay/CharRNNCrossValidationTest.java (original)
+++ labs/yay/trunk/core/src/test/java/org/apache/yay/CharRNNCrossValidationTest.java Sun Oct 9 06:32:50 2016
@@ -53,6 +53,7 @@ public class CharRNNCrossValidationTest
             {1e-1f, 25, 100}, {1e-1f, 200, 50}, {1e-1f, 200, 40}, {1e-1f, 100, 30}, {1e-1f, 100, 20}, {1e-1f, 250, 20},
             {1e-1f, 250, 15}, {1e-2f, 50, 64}, {3e-2f, 50, 128}, {1e-2f, 100, 128}, {1e-2f, 100, 256}, {1e-2f, 100, 512},
             {1e-2f, 100, 128}, {1e-3f, 100, 256}, {1e-3f, 100, 512}, {1e-4f, 100, 128}, {1e-4f, 100, 256},
+            {1e-3f, 100, 100},
     });
   }
 
@@ -61,7 +62,7 @@ CharRNNCrossValidationTest
     System.out.println("hyperparameters: " + learningRate + ", " + seqLength + ", " + hiddenLayerSize);
     InputStream resourceAsStream = getClass().getResourceAsStream("/word2vec/abstracts.txt");
     String text = IOUtils.toString(resourceAsStream);
-    int epochs = 20;
+    int epochs = 1000000;
     CharRNN charRNN = new CharRNN(learningRate, seqLength, hiddenLayerSize, epochs, text);
     List<String> words = Arrays.asList(text.split(" "));
     charRNN.learn();

Modified: labs/yay/trunk/core/src/test/java/org/apache/yay/WordRNNCrossValidationTest.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/java/org/apache/yay/WordRNNCrossValidationTest.java?rev=1763945&r1=1763944&r2=1763945&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/java/org/apache/yay/WordRNNCrossValidationTest.java (original)
+++ labs/yay/trunk/core/src/test/java/org/apache/yay/WordRNNCrossValidationTest.java Sun Oct 9 06:32:50 2016
@@ -53,7 +53,7 @@ public class WordRNNCrossValidationTest
            {1e-1f, 25, 100}, {1e-1f, 200, 50}, {1e-1f, 200, 40}, {1e-1f, 100, 30}, {1e-1f, 100, 20}, {1e-1f, 250, 20},
            {1e-1f, 250, 15}, {1e-2f, 50, 64}, {3e-2f, 50, 128}, {1e-2f, 100, 128}, {1e-2f, 100, 256}, {1e-2f, 100, 512},
            {1e-2f, 100, 128}, {1e-3f, 100, 256}, {1e-3f, 100, 512}, {1e-4f, 100, 128}, {1e-4f, 100, 256},
-           {1e-4f, 200, 1000},
+           {2e-1f, 25, 100},
     });
   }
 
@@ -62,23 +62,13 @@ WordRNNCrossValidationTest
     System.out.println("hyperparameters: " + learningRate + ", " + seqLength + ", " + hiddenLayerSize);
     InputStream resourceAsStream = getClass().getResourceAsStream("/word2vec/abstracts.txt");
     String text = IOUtils.toString(resourceAsStream);
-    int epochs = 100;
+    int epochs = 100000;
     WordRNN wordRNN = new WordRNN(learningRate, seqLength, hiddenLayerSize, epochs, text);
-    List<String> words = Arrays.asList(text.split(" "));
     wordRNN.learn();
     for (int i = 0; i < 10; i++) {
-      double c = 0;
       String sample = wordRNN.sample(r.nextInt(wordRNN.getVocabSize()));
-      String[] sampleWords = sample.split(" ");
-      for (String sw : sampleWords) {
-        if (words.contains(sw)) {
-          c++;
-        }
-      }
-      if (c > 0) {
-        c /= sample.length();
-      }
-      System.out.println("correct word ratio: " + c);
+      System.out.println(sample);
+      System.out.println("***");
     }
   }

Modified: labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt?rev=1763945&r1=1763944&r2=1763945&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt Sun Oct 9 06:32:50 2016
@@ -3,14 +3,14 @@ With this goal in mind , we present a fo
 It turns out that the operators developed in this way are precisely the connectives of quantum logic ( Birkhoff and von Neumann , 1936 ) , which to our knowledge have not been exploited before in natural language processing .
 In quantum logic , arbitrary sets are replaced by linear subspaces of a vector space , and set unions , intersections and complements are replaced by vector sum , intersection and orthogonal complements of subspaces .
 We demonstrate that these logical connectives ( particularly the orthogonal complement for negation ) are powerful tools for exploring and analysing word meanings and show distinct advantages over Boolean operators in document retrieval experiments .
-This paper is organised as follows .
+This paper is organised as follows :
 In Section 1.1 we describe some of the ways vectors have been used to represent the meanings of terms and documents in natural language processing , and describe the way the WORD-SPACE used in our later experiments is built automatically from text corpora .
 In Section 1.2 we define the logical connectives on vector spaces , focussing particularly on negation and disjunction .
 This introduces the basic material needed to understand the worked examples given in Section 1.3 , and the document retrieval experiments described in Section 1.3.1 .
 Section 1.4 gives a much fuller outline of the theory of quantum logic , the natural setting for the operators of Section 1.2 .
 Finally , in Section 1.5 , we examine the similarities between quantum logic and WORD-SPACE , asking whether quantum logic is an appropriate framework for modelling word-meanings or if the initial successes we have obtained are mainly coincidental .
 To some extent , this paper may have been written backwards , in that the implementation and examples are at the beginning and most of the theory is at the end .
-This is for two reasons .
+This is for two reasons :
 Firstly , we hoped to make the paper as accessible as possible and were afraid that beginning with an introduction to the full machinery of quantum logic would defeat this goal before the reader has a chance to realise that the techniques and equations used in this work are really quite elementary .
 Secondly , the link with 'quantum logic' was itself only brought to our attention after the bulk of the results in this paper had been obtained , and since this research is very much ongoing , we deemed it appropriate to give an honest account of its history and current state .
 We propose two novel model architectures for computing continuous vector representations of words from very large data sets
 The quality of these representations is measured in a word similarity task , and the results are compared to the previously best performing techniques based on different types of neural networks .
@@ -73,4 +73,26 @@ The tutorial covers input encoding for n
 The development of intelligent machines is one of the biggest unsolved challenges in computer science .
 In this paper , we propose some fundamental properties these machines should have , focusing in particular on communication and learning .
 We discuss a simple environment that could be used to incrementally teach a machine the basics of natural-language-based communication , as a prerequisite to more complex interaction with human users .
-We also present some conjectures on the sort of algorithms the machine should support in order to profitably learn from the environment .
\ No newline at end of file
+We also present some conjectures on the sort of algorithms the machine should support in order to profitably learn from the environment .
+In this work , we present the first results for neuralizing an Unsupervised Hidden Markov Model .
+We evaluate our approach on tag induction .
+Our approach outperforms existing generative models and is competitive with the state-of-the-art though with a simpler model easily extended to include additional context .
+Deep Neural Networks (DNNs) are powerful models that have achieved excellent performance on difficult learning tasks .
+Although DNNs work well whenever large labeled training sets are available , they cannot be used to map sequences to sequences .
+In this paper, we present a general end-to-end approach to sequence learning that makes minimal assumptions on the sequence structure .
+Our method uses a multilayered Long Short-Term Memory (LSTM) to map the input sequence to a vector of a fixed dimensionality , and then another deep LSTM to decode the target sequence from the vector .
+Our main result is that on an English to French translation task from the WMT'14 dataset , the translations produced by the LSTM achieve a BLEU score of 34.8 on the entire test set, where the LSTM's BLEU score was penalized on out-of-vocabulary words .
+Additionally , the LSTM did not have difficulty on long sentences . For comparison , a phrase-based SMT system achieves a BLEU score of 33.3 on the same dataset .
+When we used the LSTM to rerank the 1000 hypotheses produced by the aforementioned SMT system , its BLEU score increases to 36.5 , which is close to the previous best result on this task.
+The LSTM also learned sensible phrase and sentence representations that are sensitive to word order and are relatively invariant to the active and the passive voice.
+Finally , we found that reversing the order of the words in all source sentences (but not target sentences) improved the LSTM's performance markedly , because doing so introduced many short term dependencies between the source and the target sentence which made the optimization problem easier .
+We combine Riemannian geometry with the mean field theory of high dimensional chaos to study the nature of signal propagation in generic , deep neural networks with random weights .
+Our results reveal an order-to-chaos expressivity phase transition , with networks in the chaotic phase computing nonlinear functions whose global curvature grows exponentially with depth but not width .
+We prove this generic class of deep random functions cannot be efficiently computed by any shallow network , going beyond prior work restricted to the analysis of single functions .
+Moreover , we formalize and quantitatively demonstrate the long conjectured idea that deep networks can disentangle highly curved manifolds in input space into flat manifolds in hidden space .
+Our theoretical analysis of the expressive power of deep networks broadly applies to arbitrary nonlinearities , and provides a quantitative underpinning for previously abstract notions about the geometry of deep functions .
+In this paper , we propose a novel neural network model called RNN Encoder-Decoder that consists of two recurrent neural networks (RNN) .
+One RNN encodes a sequence of symbols into a fixed-length vector representation , and the other decodes the representation into another sequence of symbols .
+The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence .
+The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder-Decoder as an additional feature in the existing log-linear model .
+Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases .
\ No newline at end of file

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@labs.apache.org
For additional commands, e-mail: commits-h...@labs.apache.org