Author: tommaso
Date: Wed Oct 7 10:55:37 2015
New Revision: 1707236

URL: http://svn.apache.org/viewvc?rev=1707236&view=rev
Log:
added iterations parameter to backprop, enhanced word2vec test
Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/core/BackPropagationLearningStrategy.java
    labs/yay/trunk/core/src/test/java/org/apache/yay/core/Word2VecTest.java
    labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/core/BackPropagationLearningStrategy.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/core/BackPropagationLearningStrategy.java?rev=1707236&r1=1707235&r2=1707236&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/core/BackPropagationLearningStrategy.java (original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/core/BackPropagationLearningStrategy.java Wed Oct 7 10:55:37 2015
@@ -40,7 +40,7 @@ public class BackPropagationLearningStra
 
   public static final double DEFAULT_THRESHOLD = 0.05;
   public static final int MAX_ITERATIONS = 100000;
-  public static final double DEFAULT_ALPHA = 0.000003;
+  public static final double DEFAULT_ALPHA = 0.0000003;
 
   private final PredictionStrategy<Double, Double> predictionStrategy;
   private final CostFunction<RealMatrix, Double, Double> costFunction;
@@ -106,11 +106,13 @@ public class BackPropagationLearningStra
 
         // calculate cost
         double newCost = costFunction.calculateAggregatedCost(samples, hypothesis);
-        if (newCost > cost && batch == -1) {
+        if (Double.POSITIVE_INFINITY == newCost || newCost > cost && batch == -1) {
          throw new RuntimeException("failed to converge at iteration " + iterations + " with alpha " + alpha + " : cost going from " + cost + " to " + newCost);
        } else if (iterations > 1 && (cost == newCost || newCost < threshold || iterations > maxIterations)) {
          System.out.println("successfully converged after " + (iterations - 1) + " iterations (alpha:" + alpha + ",threshold:" + threshold + ") with cost " + newCost + " and parameters " + Arrays.toString(hypothesis.getParameters()));
          break;
+        } else if (Double.isNaN(newCost)){
+          throw new RuntimeException("failed to converge at iteration " + iterations + " with alpha " + alpha + " : cost calculation underflow");
        }
 
        // update registered cost
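For readers outside the project, here is a minimal, self-contained sketch of just the convergence/divergence checks the hunk above introduces, so the control flow can be read and run in isolation. It is not the actual BackPropagationLearningStrategy: the class name, the run(...) helper and the precomputed cost sequence are placeholders that only mirror the conditions in the diff.

// Sketch only: replays a precomputed sequence of cost values through the same checks
// the diff adds (divergence on growing/overflowing cost, NaN underflow, convergence).
public class ConvergenceCheckSketch {

  static double run(double[] costs, double threshold, int maxIterations, int batch, double alpha) {
    double cost = Double.MAX_VALUE;
    int iterations = 0;
    for (double newCost : costs) {
      iterations++;
      if (Double.POSITIVE_INFINITY == newCost || (newCost > cost && batch == -1)) {
        // full-batch updates should never increase the cost: growth or overflow means divergence
        throw new RuntimeException("failed to converge at iteration " + iterations
            + " with alpha " + alpha + " : cost going from " + cost + " to " + newCost);
      } else if (iterations > 1 && (cost == newCost || newCost < threshold || iterations > maxIterations)) {
        // stop when the cost stalls, drops below the threshold, or the iteration budget runs out
        return newCost;
      } else if (Double.isNaN(newCost)) {
        // NaN signals that the cost calculation underflowed or produced invalid arithmetic
        throw new RuntimeException("failed to converge at iteration " + iterations
            + " with alpha " + alpha + " : cost calculation underflow");
      }
      cost = newCost;
    }
    return cost;
  }

  public static void main(String[] args) {
    // converges on the third step, once the cost falls below the threshold
    System.out.println(run(new double[]{1.0, 0.5, 0.04}, 0.05, 100000, -1, 0.0000003));
  }
}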
Modified: labs/yay/trunk/core/src/test/java/org/apache/yay/core/Word2VecTest.java
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/java/org/apache/yay/core/Word2VecTest.java?rev=1707236&r1=1707235&r2=1707236&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/java/org/apache/yay/core/Word2VecTest.java (original)
+++ labs/yay/trunk/core/src/test/java/org/apache/yay/core/Word2VecTest.java Wed Oct 7 10:55:37 2015
@@ -19,9 +19,14 @@ package org.apache.yay.core;
 
 import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.ObjectOutputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -31,7 +36,9 @@ import java.util.List;
 import java.util.Random;
 
 import org.apache.commons.math3.linear.Array2DRowRealMatrix;
+import org.apache.commons.math3.linear.MatrixUtils;
 import org.apache.commons.math3.linear.RealMatrix;
+import org.apache.commons.math3.linear.SingularValueDecomposition;
 import org.apache.yay.Feature;
 import org.apache.yay.Input;
 import org.apache.yay.NeuralNetwork;
@@ -58,69 +65,87 @@ public class Word2VecTest {
     Collection<String> fragments = getFragments(sentences, 4);
     assertFalse(fragments.isEmpty());
 
+    // TODO : make it possible to define the no. of hidden units
+    // int n = new Random().nextInt(20);
     TrainingSet<Double, Double> trainingSet = createTrainingSet(vocabulary, fragments);
-//    int n = new Random().nextInt(20);
-
     TrainingExample<Double, Double> next = trainingSet.iterator().next();
-    int inputSize = next.getFeatures().size();
+    int inputSize = next.getFeatures().size() ;
     int outputSize = next.getOutput().length;
 
     RealMatrix[] randomWeights = createRandomWeights(inputSize, inputSize, outputSize);
 
     FeedForwardStrategy predictionStrategy = new FeedForwardStrategy(new IdentityActivationFunction<Double>());
     BackPropagationLearningStrategy learningStrategy = new BackPropagationLearningStrategy(BackPropagationLearningStrategy.
            DEFAULT_ALPHA, -1, BackPropagationLearningStrategy.DEFAULT_THRESHOLD, predictionStrategy, new LMSCostFunction(),
-            5);
+            10);
     NeuralNetwork neuralNetwork = NeuralNetworkFactory.create(randomWeights, learningStrategy, predictionStrategy);
 
     neuralNetwork.learn(trainingSet);
 
-    String word = vocabulary.get(new Random().nextInt(vocabulary.size()));
-//    final Double[] doubles = ConversionUtils.toValuesCollection(next.getFeatures()).toArray(new Double[next.getFeatures().size()]);
-    final Double[] doubles = hotEncode(word, vocabulary);
-//    String word = hotDecode(doubles, vocabulary);
-
-//    TrainingExample<Double, Double> input = ExamplesFactory.createDoubleArrayTrainingExample(new Double[outputSize], doubles);
-    Input<Double> input = new TrainingExample<Double, Double>() {
-      @Override
-      public ArrayList<Feature<Double>> getFeatures() {
-        ArrayList<Feature<Double>> features = new ArrayList<Feature<Double>>();
-        for (Double d : doubles) {
-          Feature<Double> f = new Feature<Double>();
-          f.setValue(d);
-          features.add(f);
+    RealMatrix vectorsMatrix = MatrixUtils.createRealMatrix(next.getFeatures().size(), next.getOutput().length);
+
+    BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(new File("target/vectors.txt")));
+    int m = 0;
+    for (String word : vocabulary) {
+      final Double[] doubles = hotEncode(word, vocabulary);
+      Input<Double> input = new TrainingExample<Double, Double>() {
+        @Override
+        public ArrayList<Feature<Double>> getFeatures() {
+          ArrayList<Feature<Double>> features = new ArrayList<Feature<Double>>();
+          Feature<Double> byasFeature = new Feature<Double>();
+          byasFeature.setValue(1d);
+          features.add(byasFeature);
+          for (Double d : doubles) {
+            Feature<Double> f = new Feature<Double>();
+            f.setValue(d);
+            features.add(f);
+          }
+          return features;
        }
-        return features;
-      }
 
-      @Override
-      public Double[] getOutput() {
-        return new Double[0];
+        @Override
+        public Double[] getOutput() {
+          return new Double[0];
+        }
+      };
+      Double[] predict = neuralNetwork.predict(input);
+      assertNotNull(predict);
+      double[] row = new double[predict.length];
+      for (int x = 0; x < row.length; x++) {
+        row[x] = predict[x];
      }
-    };
-    Double[] predict = neuralNetwork.predict(input);
-    assertNotNull(predict);
-
-    System.out.println(Arrays.toString(predict));
-
-    Double[] wordVec1 = Arrays.copyOfRange(predict, 0, vocabulary.size());
-    assertNotNull(wordVec1);
-    Double[] wordVec2 = Arrays.copyOfRange(predict, vocabulary.size(), 2 * vocabulary.size());
-    assertNotNull(wordVec2);
-    Double[] wordVec3 = Arrays.copyOfRange(predict, 2 * vocabulary.size(), 3 * vocabulary.size());
-    assertNotNull(wordVec3);
-
-    String word1 = hotDecode(wordVec1, vocabulary);
-    assertNotNull(word1);
-    assertTrue(vocabulary.contains(word1));
-    String word2 = hotDecode(wordVec2, vocabulary);
-    assertNotNull(word2);
-    assertTrue(vocabulary.contains(word2));
-    String word3 = hotDecode(wordVec3, vocabulary);
-    assertNotNull(word3);
-    assertTrue(vocabulary.contains(word3));
+      vectorsMatrix.setRow(m, row);
+      m++;
+
+      String vectorString = Arrays.toString(predict);
+      bufferedWriter.append(vectorString);
+      bufferedWriter.newLine();
+
+      Double[] wordVec1 = Arrays.copyOfRange(predict, 0, vocabulary.size());
+      assertNotNull(wordVec1);
+      Double[] wordVec2 = Arrays.copyOfRange(predict, vocabulary.size(), 2 * vocabulary.size());
+      assertNotNull(wordVec2);
+      Double[] wordVec3 = Arrays.copyOfRange(predict, 2 * vocabulary.size(), 3 * vocabulary.size());
+      assertNotNull(wordVec3);
+
+      String word1 = hotDecode(wordVec1, vocabulary);
+      assertNotNull(word1);
+      assertTrue(vocabulary.contains(word1));
+      String word2 = hotDecode(wordVec2, vocabulary);
+      assertNotNull(word2);
+      assertTrue(vocabulary.contains(word2));
+      String word3 = hotDecode(wordVec3, vocabulary);
+      assertNotNull(word3);
+      assertTrue(vocabulary.contains(word3));
+
+      System.out.println(word + " -> " + word1 + " " + word2 + " " + word3);
+    }
+    bufferedWriter.flush();
+    bufferedWriter.close();
+
+    ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(new File("target/vectors.bin")));
+    MatrixUtils.serializeRealMatrix(vectorsMatrix, os);
 
-    System.out.println(word + " -> " + word1 + " " + word2 + " " + word3);
   }
 
   private String hotDecode(Double[] doubles, List<String> vocabulary) {
@@ -136,7 +161,6 @@ public class Word2VecTest {
     return vocabulary.get(index);
   }
 
-
   private TrainingSet<Double, Double> createTrainingSet(List<String> vocabulary, Collection<String> fragments) {
     Collection<TrainingExample<Double, Double>> samples = new LinkedList<TrainingExample<Double, Double>>();
     for (String fragment : fragments) {
@@ -170,6 +194,9 @@ public class Word2VecTest {
        @Override
        public ArrayList<Feature<Double>> getFeatures() {
          ArrayList<Feature<Double>> features = new ArrayList<Feature<Double>>();
+          Feature<Double> byasFeature = new Feature<Double>();
+          byasFeature.setValue(1d);
+          features.add(byasFeature);
          for (Double d : input) {
            Feature<Double> e = new Feature<Double>();
            e.setValue(d);
@@ -283,4 +310,4 @@ public class Word2VecTest {
      }
    }
    return initialWeights;
-}
+}
\ No newline at end of file
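As a reading aid, below is a stand-alone sketch of the one-hot encoding/decoding idiom the test leans on: hotEncode turns a word into a vocabulary-sized vector with a single 1.0, and hotDecode maps a (generally real-valued) predicted vector back to the word at its largest component. This is an assumption-level illustration, not the project's actual helpers, and it deliberately leaves out the extra bias feature ("byasFeature", fixed at 1d) that the hunks above prepend to each input.

// Illustrative sketch only; the class and method bodies are assumptions mirroring the
// test's hotEncode/hotDecode usage, not code from the yay repository.
import java.util.Arrays;
import java.util.List;

public class HotEncodingSketch {

  static Double[] hotEncode(String word, List<String> vocabulary) {
    Double[] vector = new Double[vocabulary.size()];
    Arrays.fill(vector, 0d);
    vector[vocabulary.indexOf(word)] = 1d; // exactly one "hot" component per word
    return vector;
  }

  static String hotDecode(Double[] vector, List<String> vocabulary) {
    int index = 0;
    double max = Double.NEGATIVE_INFINITY;
    for (int i = 0; i < vector.length; i++) {
      if (vector[i] > max) { // argmax over the predicted (real-valued) vector
        max = vector[i];
        index = i;
      }
    }
    return vocabulary.get(index);
  }

  public static void main(String[] args) {
    List<String> vocabulary = Arrays.asList("word2vec", "skip-gram", "softmax");
    Double[] encoded = hotEncode("skip-gram", vocabulary);
    System.out.println(Arrays.toString(encoded) + " -> " + hotDecode(encoded, vocabulary));
  }
}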
Modified: labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt
URL: http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt?rev=1707236&r1=1707235&r2=1707236&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt Wed Oct 7 10:55:37 2015
@@ -2,14 +2,14 @@ The word2vec software of Tomas Mikolov a
 The learning models behind the software are described in two research papers
 We found the description of the models in these papers to be somewhat cryptic and hard to follow
 While the motivations and presentation may be obvious to the neural-networks language-modeling crowd we had to struggle quite a bit to figure out the rationale behind the equations
-This note is an attempt to explain the negative sampling equation in “Distributed Representations of Words and Phrases and their Compositionality” by Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado and Jeffrey Dean
+This note is an attempt to explain the negative sampling equation in “Distributed Representations of Words and Phrases and their Compositionality” by Tomas Mikolov Ilya Sutskever Kai Chen Greg Corrado and Jeffrey Dean
 The departure point of the paper is the skip-gram model
 In this model we are given a corpus of words w and their contexts c
-We consider the conditional probabilities p(c|w) and given a corpus Text, the goal is to set the parameters θ of p(c|w;θ) so as to maximize the corpus probability
+We consider the conditional probabilities p(c|w) and given a corpus Text the goal is to set the parameters θ of p(c|w;θ) so as to maximize the corpus probability
 The recently introduced continuous Skip-gram model is an efficient method for learning high-quality distributed vector representations that capture a large number of precise syntactic and semantic word relationships
 In this paper we present several extensions that improve both the quality of the vectors and the training speed
 By subsampling of the frequent words we obtain significant speedup and also learn more regular word representations
 We also describe a simple alternative to the hierarchical softmax called negative sampling
 An inherent limitation of word representations is their indifference to word order and their inability to represent idiomatic phrases
-For example, the meanings of “Canada” and “Air” cannot be easily combined to obtain “Air Canada”
-Motivated by this example, we present a simple method for finding phrases in text and show that learning good vector representations for millions of phrases is possible
\ No newline at end of file
+For example the meanings of “Canada” and “Air” cannot be easily combined to obtain “Air Canada”
+Motivated by this example we present a simple method for finding phrases in text and show that learning good vector representations for millions of phrases is possible
\ No newline at end of file
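The test sentences above paraphrase the skip-gram setup; one common way to write the corpus-probability objective they refer to is the following (a reading aid only, this notation does not appear in the commit):

\[
  \arg\max_{\theta} \prod_{w \in \mathrm{Text}} \; \prod_{c \in C(w)} p(c \mid w; \theta)
\]

where C(w) denotes the set of contexts observed for the word w.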