A-Postl commented on code in PR #1625:
URL: https://github.com/apache/systemds/pull/1625#discussion_r888277645
##########
scripts/nn/examples/AttentionExample.dml:
##########
@@ -0,0 +1,442 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# We implement a simple example using the basic self-attention
+# mechanism in combination with an LSTM recurrent layer.
+#
+# We use the clickbait dataset
+# (https://www.kaggle.com/datasets/amananandrai/clickbait-dataset?select=clickbait_data.csv),
+# a simple binary text classification task with 32000 samples.
+#-------------------------------------------------------------
+
+
+source("nn/layers/attention.dml") as attention
+source("nn/layers/affine.dml") as affine
+source("nn/layers/lstm.dml") as lstm
+source("nn/layers/relu.dml") as relu
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/optim/adam.dml") as adam
+source("nn/layers/log_loss.dml") as log_loss
+
+
+# 1 get data
+data_loc = "scripts/nn/examples/data/"
+tableschema = "string,int"
+N = 32000        # samples of the whole dataset
+n = 8000         # samples to use for training
+max_length = 32  # maximum sequence length
+epochs = 30
+batch_size = n/200
+val_size = batch_size * 5
+
+data = read(data_loc + "clickbait_data.csv", format="csv", header=TRUE, sep=",", data_type="frame", schema=tableschema, cols=2, rows=N)
+
+
+[x_train, y_train, vocab_size] = preprocess(data, max_length, N)
+
+x_train = x_train[1:n]
+y_train = y_train[1:n]
+
+# train network
+# TODO fix: get error when batch size is not always equal
+
+[biases, weights] = train(x_train, y_train, epochs, batch_size, max_length, vocab_size, val_size)
+
+
+preprocess = function(frame[unknown] data, integer max_length, integer n)
+  return (matrix[double] features, matrix[double] targets, integer vocab_size)
+{
+  /*
+   * Preprocesses the raw text data into integer tokens and shuffles
+   * features and targets.
+   *
+   * Inputs:
+   *  - data: data frame with [string, int] schema and n rows.
+   *  - max_length: maximum sequence length we use for training.
+   *  - n: number of samples.
+   *
+   * Outputs:
+   *  - features: feature matrix of shape (n, max_length).
+   *  - targets: label vector of shape (n, 1).
+   *  - vocab_size: vocabulary size, used to define the size of the
+   *      embedding matrix during training.
+   */
+
+  # map to lowercase, remove non-alphanumeric characters
+  formatted = map(data[,1], "x -> x.toLowerCase().replaceAll(\"[^\\p{Alnum}]+\", \" \").replaceAll(\"[\\s]+\", \" \")")
+  ids = as.frame(seq(1, nrow(formatted), 1))
+  formatted = cbind(ids, formatted)
+
+  # tokenize and recode the text into integer id sequences for the lstm
+  spec = "{\"algo\" : \"split\", \"out\": \"position\", \"tokenize_col\": 2, \"id_cols\": [1]}"
+  tokenized = tokenize(target=formatted, spec=spec, max_tokens=max_length)
+  recode_spec = "{ \"recode\": [C3]}"
+  [tokens, mapping] = transformencode(target=tokenized, spec=recode_spec)
+  features = matrix(0, rows=n, cols=max_length)
+  row_old = as.scalar(tokens[1,1])
+  pos = 1
+  for (i in 1:nrow(tokens))
+  {
+    row = as.scalar(tokens[i,1])
+    if (row != row_old)
+    {
+      row_old = row
+      pos = 1
+    }
+    features[row,pos] = tokens[i,3]
+    pos += 1
+  }
+  features = replace(target=features, pattern=NaN, replacement=-1)
+  features = features + 2
+  vocab_size = as.integer(max(features))
+
+  targets = as.matrix(data[,2])
+
+  # shuffle data
+  r = rand(rows=n, cols=1, min=0, max=1, pdf="uniform")
+  x = order(target=cbind(r, features), by=1)
+  y = order(target=cbind(r, targets), by=1)
+  features = x[,2:ncol(x)]
+  targets = y[,2:ncol(y)]
+}
+
+train = function(matrix[double] x_train,
+                 matrix[double] y_train,
+                 integer epochs,
+                 integer batch_size,
+                 integer max_sequence_length,
+                 integer vocab_size,
+                 integer val_size)
+  return (List[unknown] biases, List[unknown] weights)
+{
+  /*
+   * Trains our example model.
+   *
+   * Inputs:
+   *  - x_train: training features, matrix of shape (N, max_sequence_length).
+   *  - y_train: training labels, matrix of shape (N, 1).
+   *  - epochs: number of epochs to train our model.
+   *  - batch_size: batch size we use in each iteration.
+   *  - max_sequence_length: maximum sequence length of the data.
+   *  - vocab_size: size of the considered vocabulary.
+   *  - val_size: size of the validation set, which is split off from
+   *      x_train and y_train.
+   *
+   * Outputs:
+   *  - biases: list of biases.
+   *  - weights: list of weights.
+   */
+  samples = nrow(x_train)
+  print("Start Training")
+
+  # validation split
+  x_val = x_train[1:val_size]
+  y_val = y_train[1:val_size]
+
+  x_train = x_train[val_size+1:samples]
+  y_train = y_train[val_size+1:samples]
+
+  samples = nrow(x_train)
+  features = ncol(x_train)
+  output_size = 1
+
+  # We use a trainable embedding, each row is an embedding for a word
+  embedding_size = 64
+  W_E = rand(rows=vocab_size, cols=embedding_size)
+
+  # 1 lstm layer
+  lstm_neurons = 150
+  [W_0, b_0, out0, c0] = lstm::init(batch_size, embedding_size, lstm_neurons)
+
+  # 2 attention layer: no weights
+
+  # 3 dense layer -> (hidden_size)
+  hidden_neurons = 128
+  [W_1, b_1] = affine::init(max_sequence_length * lstm_neurons, hidden_neurons, -1)
+
+  # 4 dense layer -> (output_size)
+  [W_2, b_2] = affine::init(hidden_neurons, output_size, -1)
+
+  # 5 sigmoid layer: no weights
+
+  # put weights & biases into lists
+  biases = list(b_0, b_1, b_2)
+  weights = list(W_0, W_1, W_2, W_E)
+
+  # optimizer init
+  [mW_E, vW_E] = adam::init(W_E)
+
+  [mW_0, vW_0] = adam::init(W_0)
+  [mW_1, vW_1] = adam::init(W_1)
+  [mW_2, vW_2] = adam::init(W_2)
+
+  [mb_0, vb_0] = adam::init(b_0)
+  [mb_1, vb_1] = adam::init(b_1)
+  [mb_2, vb_2] = adam::init(b_2)
+
+  # optimizer params
+  lr = 0.001
+  beta1 = 0.99
+  beta2 = 0.999
+  epsilon = 1e-8
+  t = 0
+
+  # training loop
+  iters = ceil(samples/batch_size)
+  for (ep in 1:epochs)
+  {
+    print("Start ep: " + ep)
+    loss = 0
+    for (i in 1:iters)
+    {
+      print("Iteration: " + i)
+      # 1 get batch data
+      start = ((i-1) * batch_size) %% samples + 1
+      end = min(samples, start + batch_size - 1)
+      # TODO fix batch size problem
+
+      x_batch = x_train[start:end,]
+      y_batch = y_train[start:end,]
+
+      # 2 predict
+      [y_hat, out5, out4, out3, out2, out1, emb, cache_out_out, cache_c_out, cache_ifog_out] =
+        predict(x_batch, biases, weights, max_sequence_length, embedding_size, lstm_neurons)
+
+      # 3 loss
+      loss = loss + log_loss::forward(y_hat, y_batch)
+
+      # 4 backpropagation
+      dout = log_loss::backward(y_hat, y_batch)
+      dprobs = sigmoid::backward(dout, out5)
+      [dout_2, dW_2, db_2] = affine::backward(dprobs, out4, W_2, b_2)
+      drelu = relu::backward(dout_2, out3)
+      [dout_1, dW_1, db_1] = affine::backward(drelu, out2, W_1, b_1)
+      datt = attention::backward(dout_1, out1, out1, out1, max_sequence_length)
+      dc = matrix(0, rows=nrow(x_batch), cols=lstm_neurons)
+      [dEmb, dW_0, db_0, dout0, dc0] = lstm::backward(datt, dc, emb, W_0, b_0,
+        max_sequence_length, embedding_size, TRUE, out0, c0,
+        cache_out_out, cache_c_out, cache_ifog_out)
+
+      # 5 update weights & biases
+      t = ep * i - 1
+
+      # embedding
+      [W_E, mW_E, vW_E] = update_embeddings(x_batch, dEmb, W_E, mW_E, vW_E,
+        lr, beta1, beta2, epsilon, t, max_sequence_length, embedding_size)
+
+      # lstm
+      [b_0, mb_0, vb_0] = adam::update(b_0, db_0, lr, beta1, beta2, epsilon, t, mb_0, vb_0)
+      [W_0, mW_0, vW_0] = adam::update(W_0, dW_0, lr, beta1, beta2, epsilon, t, mW_0, vW_0)
+
+      # hidden affine
+      [b_1, mb_1, vb_1] = adam::update(b_1, db_1, lr, beta1, beta2, epsilon, t, mb_1, vb_1)
+      [W_1, mW_1, vW_1] = adam::update(W_1, dW_1, lr, beta1, beta2, epsilon, t, mW_1, vW_1)
+
+      # output affine
+      [b_2, mb_2, vb_2] = adam::update(b_2, db_2, lr, beta2, beta2, epsilon, t, mb_2, vb_2)
+      [W_2, mW_2, vW_2] = adam::update(W_2, dW_2, lr, beta2, beta2, epsilon, t, mW_2, vW_2)
+
+      # put weights & biases into lists
+      biases = list(b_0, b_1, b_2)
+      weights = list(W_0, W_1, W_2, W_E)
+    }
+    val_loss = evaluate(x_val, y_val, biases, weights, lstm_neurons, max_sequence_length,
+      embedding_size, batch_size)
+    loss = loss / iters
+    print("Epoch: " + ep + "; Train Loss: " + loss + "; Val. Loss: " + val_loss)
+  }
+}
+
+predict = function(matrix[double] x,
+                   List[unknown] biases,
+                   List[unknown] weights,
+                   integer max_sequence_length,
+                   integer embedding_size,
+                   integer lstm_neurons)
+  return (matrix[double] y_hat, matrix[double] out5, matrix[double] out4, matrix[double] out3,
+          matrix[double] out2, matrix[double] out1, matrix[double] emb, matrix[double] cache_out_out,
+          matrix[double] cache_c_out, matrix[double] cache_ifog_out)
+{
+  /*
+   * Predicts an output y_hat for given samples x.
+   *
+   * Inputs:
+   *  - x: sample features of shape (batch_size, max_sequence_length).
+   *  - biases: list of biases of length 3 (lstm, affine, affine).
+   *  - weights: list of weights of length 4 (lstm, affine, affine, embedding).
+   *  - max_sequence_length: number of words per sample.
+   *  - embedding_size: size of the embedding vector for one word.
+   *  - lstm_neurons: number of neurons in the lstm layer.
+   *
+   * Outputs:
+   *  - y_hat: matrix of shape (batch_size, 1), prediction for log-loss, output of sigmoid layer.
+   *  - out5: output of final affine layer, shape (batch_size, 1).
+   *  - out4: output of relu layer.
+   *  - out3: output of hidden affine layer.
+   *  - out2: output of attention layer.
+   *  - out1: output states from lstm layer, of shape (batch_size, max_sequence_length * lstm_neurons).
+   *  - cache_out_out: cache_out output from lstm layer.
+   *  - cache_c_out: cache_c output from lstm layer.
+   *  - cache_ifog_out: cache_ifog output from lstm layer.
+   */
+
+  # unpack weights & biases
+  W_0 = as.matrix(weights[1])
+  W_1 = as.matrix(weights[2])
+  W_2 = as.matrix(weights[3])
+  W_E = as.matrix(weights[4])
+
+  b_0 = as.matrix(biases[1])
+  b_1 = as.matrix(biases[2])
+  b_2 = as.matrix(biases[3])
+
+  # fetch embedding
+  emb = fetch_embeddings(x, W_E, max_sequence_length, embedding_size)
+
+  # put input through layers
+  batch_size = nrow(x)
+  out0 = matrix(0, batch_size, lstm_neurons)
+  c0 = out0
+  [out1, c_out, cache_out_out, cache_c_out, cache_ifog_out] =
+    lstm::forward(emb, W_0, b_0, max_sequence_length, embedding_size, TRUE, out0, c0)
+  out2 = attention::forward(out1, out1, out1, max_sequence_length)
+  out3 = affine::forward(out2, W_1, b_1)
+  out4 = relu::forward(out3)
+  out5 = affine::forward(out4, W_2, b_2)
+  y_hat = sigmoid::forward(out5)
+}
+
+fetch_embeddings = function(matrix[double] indexes, matrix[double] W_E,

Review Comment:
   Not sure if there is a better way to implement a learnable embedding in SystemDS yet.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
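Editor's note on the review question above: one way to express a learnable embedding with existing SystemDS builtins is to phrase the lookup as a matrix product with a one-hot matrix built via table(), so the gradient for the whole embedding matrix falls out of a single transpose-multiply and the existing adam::update can be applied to W_E directly. The following is a minimal, hypothetical DML sketch (not part of the PR): the names embedding_forward/embedding_backward are invented, token ids are assumed to lie in 1..vocab_size as produced by preprocess, and the (batch_size, max_sequence_length*embedding_size) layout is assumed to be what lstm::forward expects.

# Hypothetical sketch: learnable embedding as one-hot %*% W_E
embedding_forward = function(matrix[double] indexes, matrix[double] W_E,
                             integer max_sequence_length, integer embedding_size)
  return (matrix[double] emb, matrix[double] onehot)
{
  # indexes: (batch_size, max_sequence_length) token ids in 1..vocab_size
  batch_size = nrow(indexes)
  vocab_size = nrow(W_E)
  # flatten the ids to a column vector and build a (sparse) one-hot matrix
  ids = matrix(indexes, rows=batch_size*max_sequence_length, cols=1)
  onehot = table(seq(1, nrow(ids)), ids, nrow(ids), vocab_size)
  # lookup = one-hot %*% W_E, reshaped to the (batch_size, T*D) layout
  # assumed by lstm::forward
  emb = matrix(onehot %*% W_E, rows=batch_size, cols=max_sequence_length*embedding_size)
}

embedding_backward = function(matrix[double] dEmb, matrix[double] onehot,
                              integer embedding_size)
  return (matrix[double] dW_E)
{
  # dEmb: (batch_size, T*D) gradient coming out of lstm::backward
  dEmb_flat = matrix(dEmb, rows=nrow(onehot), cols=embedding_size)
  # rows of W_E whose token does not occur in the batch get a zero gradient
  dW_E = t(onehot) %*% dEmb_flat
}

With this formulation, fetch_embeddings/update_embeddings could reduce to a call to embedding_forward in predict and a call to embedding_backward followed by a single adam::update(W_E, dW_E, ...) in the training loop; the trade-off is materializing the (batch_size*max_sequence_length) x vocab_size one-hot matrix, which should stay sparse.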
