ankkhedia commented on a change in pull request #12670: [MXNET-833] [R] 
Char-level RNN tutorial fix
URL: https://github.com/apache/incubator-mxnet/pull/12670#discussion_r220740407
 
 

 ##########
 File path: R-package/vignettes/CharRnnModel.Rmd
 ##########
 @@ -36,237 +32,262 @@ download.data <- function(data_dir) {
 }
 ```
 
-Make dictionary from text.
-
-```{r}
-make.dict <- function(text, max.vocab=10000) {
-    text <- strsplit(text, '')
-    dic <- list()
-    idx <- 1
-    for (c in text[[1]]) {
-        if (!(c %in% names(dic))) {
-            dic[[c]] <- idx
-            idx <- idx + 1
-        }
-    }
-    if (length(dic) == max.vocab - 1)
-        dic[["UNKNOWN"]] <- idx
-    cat(paste0("Total unique char: ", length(dic), "\n"))
-    return (dic)
-}
-```
-
-Transfer text into data feature.
-
-```{r}
-make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
-    fi <- file(file.path, "r")
-    text <- paste(readLines(fi), collapse="\n")
-    close(fi)
-
-    if (is.null(dic))
-        dic <- make.dict(text, max.vocab)
-    lookup.table <- list()
-    for (c in names(dic)) {
-        idx <- dic[[c]]
-        lookup.table[[idx]] <- c 
-    }
-
-    char.lst <- strsplit(text, '')[[1]]
-    num.seq <- as.integer(length(char.lst) / seq.len)
-    char.lst <- char.lst[1:(num.seq * seq.len)]
-    data <- array(0, dim=c(seq.len, num.seq))
-    idx <- 1
-    for (i in 1:num.seq) {
-        for (j in 1:seq.len) {
-            if (char.lst[idx] %in% names(dic))
-                data[j, i] <- dic[[ char.lst[idx] ]]-1
-            else {
-                data[j, i] <- dic[["UNKNOWN"]]-1
-            }
-            idx <- idx + 1
-        }
-    }
-    return (list(data=data, dic=dic, lookup.table=lookup.table))
+Next we transform the text into feature vectors that are fed into the RNN 
model. The `make_data` function reads the dataset, cleans it of any 
non-alphanumeric characters, splits it into individual characters and groups it 
into sequences of length `seq.len`.
+
+
+```R
+make_data <- function(path, seq.len=32, dic=NULL) {
+  
+  text_vec <- read_file(file = path)
+  text_vec <- stri_enc_toascii(str = text_vec)
+  text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", 
replacement = "")
+  text_vec <- strsplit(text_vec, '') %>% unlist
+  
+  if (is.null(dic)) {
+    char_keep <- sort(unique(text_vec))
+  } else char_keep <- names(dic)[!dic == 0]
+  
+  # Remove terms not part of dictionary
+  text_vec <- text_vec[text_vec %in% char_keep]
+  
+  # Build dictionary
+  dic <- 1:length(char_keep)
+  names(dic) <- char_keep
+  
+  # reverse dictionary
+  rev_dic <- names(dic)
+  names(rev_dic) <- dic
+  
+  # Adjust by -1 to have a 1-lag for labels
+  num.seq <- as.integer(floor((length(text_vec)-1)/seq.len))
+  
+  features <- dic[text_vec[1:(seq.len*num.seq)]] 
+  labels <- dic[text_vec[1:(seq.len*num.seq)+1]]
+  
+  features_array <- array(features, dim=c(seq.len, num.seq))
+  labels_array <- array(labels, dim=c(seq.len, num.seq))
+  
+  return (list(features_array=features_array, labels_array=labels_array, 
dic=dic, rev_dic=rev_dic))
 }
-```
 
-Move tail text.
-
-```{r}
-drop.tail <- function(X, batch.size) {
-    shape <- dim(X)
-    nstep <- as.integer(shape[2] / batch.size)
-    return (X[, 1:(nstep * batch.size)])
-}
-```
 
-Get the label of X
-
-```{r}
-get.label <- function(X) {
-    label <- array(0, dim=dim(X))
-    d <- dim(X)[1]
-    w <- dim(X)[2]
-    for (i in 0:(w-1)) {
-        for (j in 1:d) {
-            label[i*d+j] <- X[(i*d+j)%%(w*d)+1]
-        }
-    }
-    return (label)
-}
+seq.len <- 100
+data_prep <- make_data(path = "input.txt", seq.len=seq.len, dic=NULL)
 ```
 
-Get training data and eval data
+Fetch the features and labels for training the model, and split the data into 
training and evaluation sets in a 9:1 ratio.
 
-```{r}
-download.data("./data/")
-ret <- make.data("./data/input.txt", seq.len=seq.len)
-X <- ret$data
-dic <- ret$dic
-lookup.table <- ret$lookup.table
 
+```R
+X <- data_prep$features_array
+Y <- data_prep$labels_array
+dic <- data_prep$dic
+rev_dic <- data_prep$rev_dic
 vocab <- length(dic)
 
-shape <- dim(X)
+samples <- tail(dim(X), 1)
 train.val.fraction <- 0.9
-size <- shape[2]
-
-X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
-X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
-X.train.data <- drop.tail(X.train.data, batch.size)
-X.val.data <- drop.tail(X.val.data, batch.size)
-
-X.train.label <- get.label(X.train.data)
-X.val.label <- get.label(X.val.data)
-
-X.train <- list(data=X.train.data, label=X.train.label)
-X.val <- list(data=X.val.data, label=X.val.label)
-```
-
-## Training Model
 
+X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
+X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]
 
-In `mxnet`, we have a function called `mx.lstm` so that users can build a 
general lstm model. 
+X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
+X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]
 
-```{r}
-model <- mx.lstm(X.train, X.val, 
-                 ctx=mx.cpu(),
-                 num.round=num.round, 
-                 update.period=update.period,
-                 num.lstm.layer=num.lstm.layer, 
-                 seq.len=seq.len,
-                 num.hidden=num.hidden, 
-                 num.embed=num.embed, 
-                 num.label=vocab,
-                 batch.size=batch.size, 
-                 input.size=vocab,
-                 initializer=mx.init.uniform(0.1), 
-                 learning.rate=learning.rate,
-                 wd=wd,
-                 clip_gradient=clip_gradient)
+train_buckets <- list("100"=list(data=X.train.data, label=X.train.label))
+eval_buckets <- list("100"=list(data=X.val.data, label=X.val.label))
 
+train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
+eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
 ```
 
-## Inference from model
+Create iterators for training and evaluation datasets.
 
 
-Some helper functions for random sample.
+```R
+vocab <- length(eval_buckets$dic)
 
-```{r}
-cdf <- function(weights) {
-    total <- sum(weights)
-    result <- c()
-    cumsum <- 0
-    for (w in weights) {
-        cumsum <- cumsum+w
-        result <- c(result, cumsum / total)
-    }
-    return (result)
-}
+batch.size = 32
 
-search.val <- function(cdf, x) {
-    l <- 1
-    r <- length(cdf) 
-    while (l <= r) {
-        m <- as.integer((l+r)/2)
-        if (cdf[m] < x) {
-            l <- m+1
-        } else {
-            r <- m-1
-        }
-    }
-    return (l)
-}
+train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = 
batch.size, 
+                                data.mask.element = 0, shuffle = TRUE)
 
-choice <- function(weights) {
-    cdf.vals <- cdf(as.array(weights))
-    x <- runif(1)
-    idx <- search.val(cdf.vals, x)
-    return (idx)
-}
+eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = 
batch.size,
+                               data.mask.element = 0, shuffle = FALSE)
 ```
 
-We can use random output or fixed output by choosing largest probability.
-
-```{r}
-make.output <- function(prob, sample=FALSE) {
-    if (!sample) {
-        idx <- which.max(as.array(prob))
-    }
-    else {
-        idx <- choice(prob)
-    }
-    return (idx)
-
+## Train the Model
+
+
+This model is a multi-layer RNN for sampling from character-level language 
models. It has a one-to-one model configuration since, for each character, we 
want to predict the next one. For a sequence of length 100, there are also 100 
labels, corresponding to the same sequence of characters but offset by a 
position of +1. The parameter `output_last_state` is set to TRUE in order to 
access the state of the RNN cells when performing inference.
+
 
Review comment:
   Would it be helpful if we mentioned that currently only one-to-one and 
many-to-one flavours are supported?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to