Author: tommaso
Date: Fri Mar 11 14:35:40 2016
New Revision: 1734572

URL: http://svn.apache.org/viewvc?rev=1734572&view=rev
Log:
skip-gram training using a configurable mini-batch size for gradient descent

Modified:
    labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java
    labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java
    labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
    labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt
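
For orientation, a minimal usage sketch of the new option, assuming the builder is
obtained via new SkipGramNetwork.Builder() (the exact entry point is not visible in
the hunks below) and that path points at the training text, as in the updated test;
the remaining options mirror the test changes:

    SkipGramNetwork network = new SkipGramNetwork.Builder().
            withWindow(3).
            fromTextAt(path).
            withDimension(10).
            withAlpha(0.01).
            withLambda(0.0001).
            useNesterovMomentum(true).
            withMu(0.9).
            withMaxIterations(30000).
            withBatchSize(10).   // new option: omit it and the batch size defaults to the full training set size
            build();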

Modified: labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java
URL: 
http://svn.apache.org/viewvc/labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java?rev=1734572&r1=1734571&r2=1734572&view=diff
==============================================================================
--- labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java 
(original)
+++ labs/yay/trunk/core/src/main/java/org/apache/yay/SkipGramNetwork.java Fri 
Mar 11 14:35:40 2016
@@ -48,7 +48,7 @@ import java.util.regex.Pattern;
 
 /**
  * A skip-gram neural network.
- * It learns its weights through backpropagation algorithm via batch gradient 
descent applied to a collection of
+ * It learns its weights through backpropagation algorithm via (configurable) 
mini batch gradient descent applied to a collection of
  * hot encoded training samples.
  */
 public class SkipGramNetwork {
@@ -132,19 +132,19 @@ public class SkipGramNetwork {
     double cc = 0;
     double wc = 0;
     int window = network.configuration.window;
+    List<String> vocabulary = network.getVocabulary();
+    Collection<Integer> exps = new LinkedList<>();
+    Collection<Integer> acts = new LinkedList<>();
     for (Sample sample : network.samples) {
-      Collection<Integer> exps = new ArrayList<>(window - 1);
-      Collection<Integer> acts = new ArrayList<>(window - 1);
       double[] inputs = sample.getInputs();
-      double[] actualOutputs = network.predictOutput(inputs);
-      double[] expectedOutputs = sample.getOutputs();
       int j = 0;
       for (int i = 0; i < window - 1; i++) {
-        int actualMax = getMaxIndex(actualOutputs, j, j + inputs.length - 1);
-        int expectedMax = getMaxIndex(expectedOutputs, j, j + inputs.length - 
1);
-        exps.add(expectedMax % inputs.length);
-        acts.add(actualMax % inputs.length);
-        j += inputs.length;
+        int le = inputs.length;
+        int actualMax = getMaxIndex(network.predictOutput(inputs), j, j + le - 
1);
+        int expectedMax = getMaxIndex(sample.getOutputs(), j, j + le - 1);
+        exps.add(expectedMax % le);
+        acts.add(actualMax % le);
+        j += le;
       }
       boolean c = true;
       for (Integer e : exps) {
@@ -152,7 +152,6 @@ public class SkipGramNetwork {
       }
       if (c) {
         cc++;
-        List<String> vocabulary = network.getVocabulary();
         String x = vocabulary.get(getMaxIndex(inputs, 0, inputs.length));
         StringBuilder y = new StringBuilder();
         for (int e : exps) {
@@ -165,7 +164,9 @@ public class SkipGramNetwork {
       } else {
         wc++;
       }
-
+      acts.clear();
+      exps.clear();
+      if (cc + wc > 2000) break;
     }
     return (cc / (wc + cc));
   }
@@ -210,10 +211,10 @@ public class SkipGramNetwork {
   }
 
 
-  // --- batch gradient descent ---
+  // --- mini batch gradient descent ---
 
   /**
-   * perform weights learning from the training examples using batch gradient 
descent algorithm
+   * perform weights learning from the training examples using (configurable) 
mini batch gradient descent algorithm
    *
    * @param samples the training examples
    * @return the final cost with the updated weights
@@ -225,14 +226,7 @@ public class SkipGramNetwork {
 
     double cost = Double.MAX_VALUE;
 
-    RealMatrix x = MatrixUtils.createRealMatrix(samples.length, 
samples[0].getInputs().length);
-    RealMatrix y = MatrixUtils.createRealMatrix(samples.length, 
samples[0].getOutputs().length);
-    int i = 0;
-    for (Sample sample : samples) {
-      x.setRow(i, ArrayUtils.addAll(sample.getInputs()));
-      y.setRow(i, ArrayUtils.addAll(sample.getOutputs()));
-      i++;
-    }
+    int j = 0;
 
     // momentum
     RealMatrix vb = MatrixUtils.createRealMatrix(biases[0].getRowDimension(), 
biases[0].getColumnDimension());
@@ -241,17 +235,25 @@ public class SkipGramNetwork {
     RealMatrix vw2 = 
MatrixUtils.createRealMatrix(weights[1].getRowDimension(), 
weights[1].getColumnDimension());
 
     long start = System.currentTimeMillis();
+    int c = 1;
     while (true) {
 
-      long time = (System.currentTimeMillis() - start) / 1000;
-      if (iterations % (1 + (configuration.maxIterations / 100)) == 0 || time 
% 300 < 2) {
-        if (time > 60) {
-          System.out.println("cost is " + cost + " after " + iterations + " 
iterations in " + (time / 60) + " minutes (" + ((double) iterations / time) + " 
ips)");
-        }
+      RealMatrix x = MatrixUtils.createRealMatrix(configuration.batchSize, 
samples[0].getInputs().length);
+      RealMatrix y = MatrixUtils.createRealMatrix(configuration.batchSize, 
samples[0].getOutputs().length);
+      int i = 0;
+      for (int k = j * configuration.batchSize; k < j * 
configuration.batchSize + configuration.batchSize; k++) {
+        Sample sample = samples[k % samples.length];
+        x.setRow(i, ArrayUtils.addAll(sample.getInputs()));
+        y.setRow(i, ArrayUtils.addAll(sample.getOutputs()));
+        i++;
       }
 
-      if (iterations % 100 == 0) {
-        System.out.println("accuracy: " + evaluate(this));
+      long time = (System.currentTimeMillis() - start) / 1000;
+      if (iterations % (1 + (configuration.maxIterations / 100)) == 0 || time 
% 300 == 0) {
+        if (time > 60 * c) {
+          c += 1;
+          System.out.println("cost: " + cost + ", accuracy: " + evaluate(this) 
+ " after " + iterations + " iterations in " + (time / 60) + " minutes (" + 
((double) iterations / time) + " ips)");
+        }
       }
 
       RealMatrix w0t = weights[0].transpose();
@@ -933,6 +935,7 @@ public class SkipGramNetwork {
     protected int window;
     protected boolean useMomentum;
     protected boolean useNesterovMomentum;
+    protected int batchSize;
   }
 
   public static class Builder {
@@ -942,6 +945,10 @@ public class SkipGramNetwork {
       this.configuration = new Configuration();
     }
 
+    public Builder withBatchSize(int batchSize) {
+      this.configuration.batchSize = batchSize;
+      return this;
+    }
 
     public Builder withWindow(int w) {
       this.configuration.window = w;
@@ -979,7 +986,6 @@ public class SkipGramNetwork {
     }
 
     public Builder useNesterovMomentum(boolean useNesterovMomentum) {
-      this.configuration.useMomentum = false;
       this.configuration.useNesterovMomentum = useNesterovMomentum;
       return this;
     }
@@ -1010,6 +1016,10 @@ public class SkipGramNetwork {
         this.configuration.maxIterations = trainingSet.size() * 100000;
       }
 
+      if (this.configuration.batchSize == 0) {
+        this.configuration.batchSize = trainingSet.size();
+      }
+
       HotEncodedSample next = trainingSet.iterator().next();
 
       this.configuration.inputs = next.getInputs().length;
@@ -1078,6 +1088,20 @@ public class SkipGramNetwork {
 
       Splitter splitter = 
Splitter.on(Pattern.compile("[\\n\\s]")).omitEmptyStrings().trimResults();
 
+      if (Files.isDirectory(path)) {
+        for (Path p : Files.newDirectoryStream(path)) {
+          addFragments(p, w, fragments, splitter);
+        }
+      } else {
+        addFragments(path, w, fragments, splitter);
+      }
+      long end = System.currentTimeMillis();
+      System.out.println("fragments read in " + (end - start) / 60000 + " 
minutes (" + fragments.size() + ")");
+      return fragments;
+
+    }
+
+    private void addFragments(Path path, int w, Queue<List<byte[]>> fragments, 
Splitter splitter) {
       ByteBuffer buffer = ByteBuffer.allocate(1);
       try (SeekableByteChannel inChannel = Files.newByteChannel(path)) {
 
@@ -1117,10 +1141,6 @@ public class SkipGramNetwork {
       } finally {
         buffer.clear();
       }
-      long end = System.currentTimeMillis();
-      System.out.println("fragments read in " + (end - start) / 60000 + " 
minutes (" + fragments.size() + ")");
-      return fragments;
-
     }
 
     private Queue<List<byte[]>> getFragmentsOld(Path path, int w) throws 
IOException {

Modified: 
labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java
URL: 
http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java?rev=1734572&r1=1734571&r2=1734572&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java 
(original)
+++ labs/yay/trunk/core/src/test/java/org/apache/yay/SkipGramNetworkTest.java 
Fri Mar 11 14:35:40 2016
@@ -47,11 +47,12 @@ public class SkipGramNetworkTest {
             withWindow(3).
             fromTextAt(path).
             withDimension(10).
-            withAlpha(0.0003).
+            withAlpha(0.01).
             withLambda(0.0001).
             useNesterovMomentum(true).
             withMu(0.9).
-            withMaxIterations(500).
+            withMaxIterations(30000).
+            withBatchSize(10).
             build();
     RealMatrix wv = network.getWeights()[0];
     List<String> vocabulary = network.getVocabulary();
@@ -67,11 +68,12 @@ public class SkipGramNetworkTest {
             withWindow(3).
             fromTextAt(path).
             withDimension(10).
-            withAlpha(0.007).
-            withLambda(0.001).
-            useMomentum(true).
-            withMu(0.7).
-            withMaxIterations(500).
+            withAlpha(0.01).
+            withLambda(0.0001).
+            useNesterovMomentum(true).
+            withMu(0.9).
+            withMaxIterations(30000).
+            withBatchSize(1).
             build();
     RealMatrix wv = network.getWeights()[0];
     List<String> vocabulary = network.getVocabulary();

Modified: labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt
URL: 
http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt?rev=1734572&r1=1734571&r2=1734572&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/abstracts.txt Fri Mar 11 
14:35:40 2016
@@ -2,7 +2,7 @@ A calculus which combined the flexible g
 With this goal in mind , we present a formulation for logical connectives in 
vector spaces based on standard linear algebra , giving examples of the use of 
vector negation to discriminate between different senses of ambiguous words .
 It turns out that the operators developed in this way are precisely the 
connectives of quantum logic ( Birkhoff and von Neumann , 1936 ) , which to our 
knowledge have not been exploited before in natural language processing .
 In quantum logic , arbitrary sets are replaced by linear subspaces of a vector 
space , and set unions , intersections and complements are replaced by vector 
sum , intersection and orthogonal complements of subspaces .
-We demonstrate that these logical connectives (particularly the orthogonal 
complement for negation) are powerful tools for exploring and analysing word 
meanings and show distinct advantages over Boolean operators in document 
retrieval experiments . 
+We demonstrate that these logical connectives ( particularly the orthogonal 
complement for negation )  are powerful tools for exploring and analysing word 
meanings and show distinct advantages over Boolean operators in document 
retrieval experiments .
 This paper is organised as follows . 
 In Section 1.1 we describe some of the ways vectors have been used to 
represent the meanings of terms and documents in natural language processing , 
and describe the way the WORD-SPACE used in our later experiments is built 
automatically from text corpora .
 In Section 1.2 we define the logical connectives on vector spaces , focussing 
particularly on negation and disjunction . 
@@ -16,12 +16,12 @@ Secondly , the link with ‘quantum l
 We propose two novel model architectures for computing continuous vector 
representations of words from very large data sets The quality of these 
representations is measured in a word similarity task , and the results are 
compared to the previously best performing techniques based on different types 
of neural networks .
 We observe large improvements in accuracy at much lower computational cost , i 
. e  it takes less than a day to learn high quality word vectors from a 1.6 
billion words data set .
 Furthermore , we show that these vectors provide state-of-the-art performance 
on our test set for measuring syntactic and semantic word similarities . 
-Information Retrieval (IR) models need to deal with two difficult issues , 
vocabulary mismatch and term dependencies .
+Information Retrieval ( IR)  models need to deal with two difficult issues , 
vocabulary mismatch and term dependencies .
 Vocabulary mismatch corresponds to the difficulty of retrieving relevant 
documents that do not contain exact query terms but semantically related terms .
 Term dependencies refers to the need of considering the relationship between 
the words of the query when estimating the relevance of a document .
 A multitude of solutions has been proposed to solve each of these two problems 
, but no principled model solve both .
 In parallel , in the last few years , language models based on neural networks 
have been used to cope with complex natural language processing tasks like 
emotion and paraphrase detection .
-Although they present good abilities to cope with both term dependencies and 
vocabulary mismatch problems , thanks to the distributed representation of 
words they are based upon , such models could not be used readily in IR , where 
the estimation of one language model per document (or query) is required .
+Although they present good abilities to cope with both term dependencies and 
vocabulary mismatch problems , thanks to the distributed representation of 
words they are based upon , such models could not be used readily in IR , where 
the estimation of one language model per document ( or query)  is required .
 This is both computationally unfeasible and prone to over-fitting .
 Based on a recent work that proposed to learn a generic language model that 
can be modified through a set of document-specific parameters , we explore use 
of new neural network models that are adapted to ad-hoc IR tasks .
 Within the language model IR framework , we propose and study the use of a 
generic language model as well as a document-specific language model .

Modified: labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt
URL: 
http://svn.apache.org/viewvc/labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt?rev=1734572&r1=1734571&r2=1734572&view=diff
==============================================================================
--- labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt (original)
+++ labs/yay/trunk/core/src/test/resources/word2vec/sentences.txt Fri Mar 11 
14:35:40 2016
@@ -24,12 +24,12 @@ However most of these models are built w
 This is problematic because words are often polysemous and global context can 
also provide useful information for learning word meanings .
 We present a new neural network architecture which 1) learns word embeddings 
that better capture the semantics of words by incorporating both local and 
global document context and 2) accounts for homonymy and polysemy by learning 
multiple embeddings per word .
 We introduce a new dataset with human judgments on pairs of words in 
sentential context and evaluate our model on it showing that our model 
outperforms competitive baselines and other neural language models .
-Information Retrieval (IR) models need to deal with two difficult issues 
vocabulary mismatch and term dependencies .
+Information Retrieval ( IR ) models need to deal with two difficult issues 
vocabulary mismatch and term dependencies .
 Vocabulary mismatch corresponds to the difficulty of retrieving relevant 
documents that do not contain exact query terms but semantically related terms .
 Term dependencies refers to the need of considering the relationship between 
the words of the query when estimating the relevance of a document .
 A multitude of solutions has been proposed to solve each of these two problems 
but no principled model solve both .
 In parallel in the last few years language models based on neural networks 
have been used to cope with complex natural language processing tasks like 
emotion and paraphrase detection .
-Although they present good abilities to cope with both term dependencies and 
vocabulary mismatch problems thanks to the distributed representation of words 
they are based upon such models could not be used readily in IR where the 
estimation of one language model per document (or query) is required .
+Although they present good abilities to cope with both term dependencies and 
vocabulary mismatch problems thanks to the distributed representation of words 
they are based upon such models could not be used readily in IR where the 
estimation of one language model per document ( or query ) is required .
 This is both computationally unfeasible and prone to over-fitting .
 Based on a recent work that proposed to learn a generic language model that 
can be modified through a set of document-specific parameters we explore use of 
new neural network models that are adapted to ad-hoc IR tasks .
 Within the language model IR framework we propose and study the use of a 
generic language model as well as a document-specific language model .
@@ -38,7 +38,7 @@ We experiment with such models and analy
 The word2vec model and application by Mikolov et al have attracted a great 
amount of attention in recent two years .
 The vector representations of words learned by word2vec models have been 
proven to be able to carry semantic meanings and are useful in various NLP 
tasks .
 As an increasing number of researchers would like to experiment with word2vec 
I notice that there lacks a material that comprehensively explains the 
parameter learning process of word2vec in details thus preventing many people 
with less neural network experience from understanding how exactly word2vec 
works .
-This note provides detailed derivations and explanations of the parameter 
update equations for the word2vec models including the original continuous 
bag-of-word (CBOW) and skip-gram models as well as advanced tricks hierarchical 
soft-max and negative sampling .
+This note provides detailed derivations and explanations of the parameter 
update equations for the word2vec models including the original continuous 
bag-of-word ( CBOW ) and skip-gram models as well as advanced tricks 
hierarchical soft-max and negative sampling .
 In the appendix a review is given on the basics of neuron network models and 
backpropagation .
 To avoid the inaccuracy caused by classifying the example into several 
categories given by TREC manually we take the word2vec to represent all 
attractions and user contexts in the continuous vector space learnt by neural 
network language models .
 The base of NNML is using neural networks for the probability function .

