This demo shows how to apply a CNN to an NLP task with variable-length sequences, using both padding and bucketing through a custom iterator with the MXNet R package.

The example is based on sentiment analysis of the IMDB movie review data.

Load some packages
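The original chunk loading the packages is not shown; a minimal sketch of what the code below relies on, assuming the roc() and auc() calls in the inference section come from the AUC package:

library(mxnet)  # symbols, iterators, training and inference
library(AUC)    # roc() and auc() used to evaluate the test predictions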

Data preparation

For this demo, the data preparation is performed by the script data_preprocessing_seq_to_one.R, which converts the IMDB reviews into sequences of word indices and groups them, together with their labels, into the buckets consumed by the bucket iterators used below.
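The training and evaluation iterators (train.data.single and eval.data.single) used in the training section are not defined in this document; a minimal sketch of how they can be built, mirroring the test iterator created later, and assuming the preprocessing script saved corpus_single_train.rds and corpus_single_eval.rds and exposes the word dictionary under a field named dic (hypothetical name):

batch.size <- 64  # matches the batch size used in the inference section

corpus_single_train <- readRDS(file = "../data/corpus_single_train.rds")
corpus_single_eval <- readRDS(file = "../data/corpus_single_eval.rds")

# vocabulary size fed to the embedding layer; `dic` is a hypothetical field name
vocab <- length(corpus_single_train$dic)

train.data.single <- mx.io.bucket.iter(buckets = corpus_single_train$buckets, batch.size = batch.size,
                                       data.mask.element = 0, shuffle = TRUE)
eval.data.single <- mx.io.bucket.iter(buckets = corpus_single_eval$buckets, batch.size = batch.size,
                                      data.mask.element = 0, shuffle = FALSE)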

Model architecture

cnn.symbol.factory <- function(seq_len, 
                               input_size,
                               num_embed, 
                               num_filters,
                               num_decode, 
                               dropout,
                               ignore_label=-1) {
  
  conv_params <- list(embed_weight=mx.symbol.Variable("embed_weight"),
                      conv1.weight = mx.symbol.Variable("conv1_weight"),
                      conv1.bias = mx.symbol.Variable("conv1_bias"),
                      conv2.weight = mx.symbol.Variable("conv2_weight"),
                      conv2.bias = mx.symbol.Variable("conv2_bias"),
                      conv3.weight = mx.symbol.Variable("conv3_weight"),
                      conv3.bias = mx.symbol.Variable("conv3_bias"),
                      fc1.weight = mx.symbol.Variable("fc1_weight"),
                      fc1.bias = mx.symbol.Variable("fc1_bias"),
                      fc_final.weight = mx.symbol.Variable("fc_final_weight"),
                      fc_final.bias = mx.symbol.Variable("fc_final_bias"))
  
  # input variables: data, label and the padding masks provided by the bucket iterator
  label <- mx.symbol.Variable("label")
  data <- mx.symbol.Variable("data")
  data_mask <- mx.symbol.Variable("data_mask")
  data_mask_array <- mx.symbol.Variable("data_mask_array")
  data_mask_array <- mx.symbol.BlockGrad(data_mask_array)
  
  embed <- mx.symbol.Embedding(data=data, weight=conv_params$embed_weight, input_dim=input_size, output_dim=num_embed, name="embed")
  embed <- mx.symbol.swapaxes(data = embed, dim1 = 1, dim2 = 2)
  embed_expand <- mx.symbol.expand_dims(data=embed, axis=-1, name="embed_expand")
  
  conv1 <- mx.symbol.Convolution(data=embed_expand, weight=conv_params$conv1.weight, bias=conv_params$conv1.bias, kernel=c(1,3), stride=c(1,1), pad=c(0,1), num_filter=8)
  act1 <- mx.symbol.Activation(data=conv1, act.type="relu", name="act1")
  pool1 <- mx.symbol.Pooling(data=act1, global.pool=FALSE, pool.type="avg", kernel=c(1,2), stride=c(1,2), pad=c(0,0), name="pool1")
  
  conv2 <- mx.symbol.Convolution(data=pool1, weight=conv_params$conv2.weight, bias=conv_params$conv2.bias, kernel=c(1,3), stride=c(1,1), pad=c(0,1), num_filter=16)
  act2 <- mx.symbol.Activation(data=conv2, act.type="relu", name="act2")
  pool2 <- mx.symbol.Pooling(data=act2, global.pool=FALSE, pool.type="avg", kernel=c(1,2), stride=c(1,2), pad=c(0,0), name="pool2")
  
  conv3 <- mx.symbol.Convolution(data=pool2, weight=conv_params$conv3.weight, 
                                 bias=conv_params$conv3.bias, kernel=c(1, 3), 
                                 stride=c(1,1), pad=c(0,1), num_filter=32)
  act3 <- mx.symbol.Activation(data=conv3, act.type="relu", name="act3")
  pool3 <- mx.symbol.Pooling(data=act3, global.pool=TRUE, pool.type="max", kernel=c(1,1), name="pool3")
  
  flatten <- mx.symbol.Flatten(data=pool3, name="flatten")
  
  # fc1 <- mx.symbol.FullyConnected(data=flatten, weight = conv_params$fc1.weight, 
  #                                 bias=conv_params$fc1.bias, num.hidden=16, name="fc1")
  # act_fc <- mx.symbol.Activation(data=fc1, act.type="relu", name="act_fc")
  # dropout<- mx.symbol.Dropout(data=act_fc, p=dropout, name="drop")
  
  fc_final <- mx.symbol.FullyConnected(data=flatten, weight=conv_params$fc_final.weight, bias=conv_params$fc_final.bias, num.hidden=num_decode, name="fc_final")
  
  # plain softmax output; the ignore_label mechanism is not needed for this classification task
  loss <- mx.symbol.SoftmaxOutput(data = fc_final, label = label, name="loss")
  return(loss)
  
}

num_decode <- 2
num_embed <- 2
num_filters <- 16
seq_len <- 600

symbol_cnn <- cnn.symbol.factory(seq_len = seq_len, input_size = vocab, 
                                 num_embed = num_embed, num_filters = num_filters, 
                                 num_decode = num_decode, dropout = 0.5, ignore_label = -1)

graph.viz(symbol_cnn, shape=c(seq_len, batch.size), direction = "LR")
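Beyond visualising the graph, the shapes flowing through the network can be checked symbolically; a minimal sketch, assuming mx.symbol.infer.shape accepts input shapes in the same column-major convention as graph.viz above:

shapes <- mx.symbol.infer.shape(symbol_cnn, data = c(seq_len, batch.size))
str(shapes)  # inspect the inferred argument and output shapes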

Model training

devices <- mx.gpu()

initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 2.5)

optimizer <- mx.opt.create("rmsprop", learning.rate = 1e-3, gamma1 = 0.95, gamma2 = 0.92, 
                           wd = 1e-4, clip_gradient = 5, rescale.grad=1/batch.size)

logger <- mx.metric.logger()
epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
batch.end.callback <- mx.callback.log.train.metric(period = 50)

system.time(
  model <- mx.model.buckets(symbol = symbol_cnn,
                            train.data = train.data.single, eval.data = eval.data.single,
                            num.round = 5, ctx = devices, verbose = FALSE,
                            metric = mx.metric.accuracy, optimizer = optimizer,  
                            initializer = initializer,
                            batch.end.callback = batch.end.callback, 
                            epoch.end.callback = epoch.end.callback)
)
##    user  system elapsed 
##   44.35  104.89   47.31
mx.model.save(model, prefix = "../models/model_cnn_classif_single", iteration = 1)
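The logger attached through epoch.end.callback accumulates the accuracy reported after each epoch; a minimal sketch of plotting the learning curves, assuming the logger exposes the per-epoch values as logger$train and logger$eval as in other MXNet R examples:

plot(logger$train, type = "l", col = "blue", xlab = "epoch", ylab = "accuracy", ylim = c(0, 1))
lines(logger$eval, col = "red")
legend("bottomright", legend = c("train", "eval"), col = c("blue", "red"), lty = 1)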

Plot word embeddings

Word representations can be visualized by looking at the learnt weights along any of the embedding dimensions. Here, we look simultaneously at the two embedding dimensions learnt by the CNN model.
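A minimal sketch of how such a plot could be produced, assuming the preprocessing output exposes the word-to-index dictionary (here called dic, a hypothetical name) and that the learnt weight comes back from the R package as a num_embed x vocab matrix with one column per word:

# embedding weights learnt during training; the dimension order is an assumption
embed_weights <- t(as.array(model$arg.params$embed_weight))  # one row per word

plot(embed_weights[, 1], embed_weights[, 2], type = "n",
     xlab = "embedding dimension 1", ylab = "embedding dimension 2")
text(embed_weights[, 1], embed_weights[, 2], labels = names(dic), cex = 0.6)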

Since the model attempts to predict the sentiment, it’s no surprise that the two dimensions onto which each word is projected appear correlated with word polarity. Positive words are associated with lower values along the first embedding dimension (great, excellent), while the most negative words appear at the far right (terrible, worst).

Inference on test data

ctx <- mx.cpu(0)
batch.size <- 64

corpus_single_test <- readRDS(file = "../data/corpus_single_test.rds")

test.data <- mx.io.bucket.iter(buckets = corpus_single_test$buckets, batch.size = batch.size, 
                               data.mask.element = 0, shuffle = FALSE)
# expose unexported internals of the mxnet package that are used during bucketed inference
mx.symbol.bind <- mxnet:::mx.symbol.bind
mx.nd.arraypacker <- mxnet:::mx.nd.arraypacker
infer <- mx.infer.rnn(infer.data = test.data, model = model, ctx = ctx)

pred_raw <- t(as.array(infer))
pred <- max.col(pred_raw, ties.method = "first") - 1
label <- unlist(lapply(corpus_single_test$buckets, function(x) x$label))

acc <- sum(label == pred)/length(label)
roc <- roc(predictions = pred_raw[, 2], labels = factor(label))
auc <- auc(roc)
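A confusion matrix complements these summary metrics with a view of the error structure:

table(actual = label, predicted = pred)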

Accuracy: 85.0%

AUC: 0.943