Optimizer Exercises

Question 1

In this exercise, the task is to play around with the settings for the optimization of a neural network. We start by generating some (highly non-linear) synthetic data using the mlbench package.

library(torch)
data <- mlbench::mlbench.friedman3(n = 3000, sd = 0.1)
X <- torch_tensor(data$x)
X[1:2, ]
torch_tensor
   40.8977   745.3378     0.3715     3.4246
   88.3017  1148.7073     0.5288     6.5953
[ CPUFloatType{2,4} ]
Y <- torch_tensor(data$y)$unsqueeze(2)
Y[1:2, ]
torch_tensor
 1.5238
 1.3572
[ CPUFloatType{2,1} ]

The associated machine learning task is to predict the output Y from the input X.

Next, we create a dataset for it using the tensor_dataset() function.

ds <- tensor_dataset(X, Y)
ds$.getitem(1)
[[1]]
torch_tensor
  40.8977
 745.3378
   0.3715
   3.4246
[ CPUFloatType{4} ]

[[2]]
torch_tensor
 1.5238
[ CPUFloatType{1} ]

We can create two sub-datasets – for training and validation – using dataset_subset().

ids_train <- sample(1000, 700)
ids_valid <- setdiff(seq_len(1000), ids_train)
ds_train <- dataset_subset(ds, ids_train)
ds_valid <- dataset_subset(ds, ids_valid)

The network that we will be fitting is a simple MLP:

nn_mlp <- nn_module("nn_mlp",
  initialize = function() {
    self$lin1 <- nn_linear(4, 50)
    self$lin2 <- nn_linear(50, 50)
    self$lin3 <- nn_linear(50, 1)
  },
  forward = function(x) {
    x |>
      self$lin1() |>
      nnf_relu() |>
      self$lin2() |>
      nnf_relu() |>
      self$lin3()
  }
)
The code to compare different optimizer configurations is provided via the compare_configs() function, which you can copy paste.
Implementation of compare_configs
library(ggplot2)
compare_configs <- function(epochs = 30, batch_size = 16, lr = 0.01, weight_decay = 0.01, beta1 = 0.9, beta2 = 0.999) {
  # Identify which parameter is a list
  args <- list(batch_size = batch_size, lr = lr, weight_decay = weight_decay, beta1 = beta1, beta2 = beta2)
  is_list <- sapply(args, is.list)

  if (sum(is_list) != 1) {
    stop("One of the arguments must be a list")
  }

  list_arg_name <- names(args)[is_list]
  list_args <- args[[list_arg_name]]
  other_args <- args[!is_list]

  # Run train_valid for each value in the list
  results <- lapply(list_args, function(arg) {
    network <- with_torch_manual_seed(seed = 123, nn_mlp())
    other_args[[list_arg_name]] <- arg
    train_valid(network, ds_train = ds_train, ds_valid = ds_valid, epochs = epochs, batch_size = other_args$batch_size,
      lr = other_args$lr, betas = c(other_args$beta1, other_args$beta2), weight_decay = other_args$weight_decay)
  })

  # Combine results into a single data frame
  combined_results <- do.call(rbind, lapply(seq_along(results), function(i) {
    df <- results[[i]]
    df$config <- paste(list_arg_name, "=", list_args[[i]])
    df
  }))

  upper <- if (max(combined_results$valid_loss) > 10) quantile(combined_results$valid_loss, 0.98) else max(combined_results$valid_loss)

  ggplot(combined_results, aes(x = epoch, y = valid_loss, color = config)) +
    geom_line() +
    theme_minimal() +
    labs(x = "Epoch", y = "Validation RMSE", color = "Configuration") +
    ylim(min(combined_results$valid_loss), upper)
}
train_loop <- function(network, dl_train, opt) {
  network$train()
  coro::loop(for (batch in dl_train) {
    opt$zero_grad()
    Y_pred <- network(batch[[1]])
    loss <- nnf_mse_loss(Y_pred, batch[[2]])
    loss$backward()
    opt$step()
  })
}

valid_loop <- function(network, dl_valid) {
  network$eval()
  valid_loss <- c()
  coro::loop(for (batch in dl_valid) {
    Y_pred <- with_no_grad(network(batch[[1]]))
    loss <- sqrt(nnf_mse_loss(Y_pred, batch[[2]]))
    valid_loss <- c(valid_loss, loss$item())
  })
  mean(valid_loss)
}

train_valid <- function(network, ds_train, ds_valid, epochs, batch_size, ...) {
  opt <- optim_ignite_adamw(network$parameters, ...)
  train_losses <- numeric(epochs)
  valid_losses <- numeric(epochs)
  dl_train <- dataloader(ds_train, batch_size = batch_size)
  dl_valid <- dataloader(ds_valid, batch_size = batch_size)
  for (epoch in seq_len(epochs)) {
    train_loop(network, dl_train, opt)
    valid_losses[epoch] <- valid_loop(network, dl_valid)
  }
  data.frame(epoch = seq_len(epochs), valid_loss = valid_losses)
}

It takes as arguments:

You can, e.g., call the function like below:

compare_configs(epochs = 30, lr = list(0.1, 0.2), weight_decay = 0.02)

One of the arguments (except for epochs) must be a list of values. The function will then run the same training configuration for each of the values in the list and visualize the results.

Explore a few hyperparameter settings and make some observations as to how they affect the trajectory of the validation loss. The purpose of this exercise is not to derive observations that hold in general.

Question 2: Optimization with Momentum

In this exercise, you will build a gradient descent optimizer with momentum. As a use case, we will minimize the Rosenbrock function. The function is defined as:

rosenbrock <- function(x, y) {
  (1 - x)^2 + 2 * (y - x^2)^2
}
rosenbrock(torch_tensor(-1), torch_tensor(-1))
torch_tensor
 12
[ CPUFloatType{1} ]

The ‘parameters’ we will be optimizing is the position of a point (x, y), which will both be updated using gradient descent. The figure below shows the Rosenbrock function, where darker values indicate lower values.

The task is to implement the optim_step() function.

optim_step <- function(x, y, lr, x_momentum, y_momentum, beta) {
  ...
}

It will receive as arguments, the current values x and y, as well as the momentum values x_momentum and y_momentum (all scalar tensors). The function should then:

  1. Perform a forward pass.
  2. Calculate the gradients.
  3. Update the momentum values in-place.
  4. Update the parameters in=-place.
Hint

To perform in-place updates, you can use the $mul_() and $add_() methods.

The update rule is given exemplarily for x:

\[ \begin{aligned} v_{t + 1} &= \beta v_t + (1 - \beta) \nabla_{x} f(x_t, y_t) \\ \end{aligned} \]

\[ \begin{aligned} x_{t+1} &= x_t - \eta v_{t + 1} \end{aligned} \]

To test your optimizer, you can use the code below. Note that you might have to play around with the number of steps and the learning rate to get a good result.

Code to test your optimizer
optimize_rosenbrock <- function(steps, lr, beta) {
  x <- torch_tensor(-1, requires_grad = TRUE)
  y <- torch_tensor(2, requires_grad = TRUE)
  momentum_x <- torch_tensor(0)
  momentum_y <- torch_tensor(0)

  trajectory <- data.frame(
    x = numeric(steps + 1),
    y = numeric(steps + 1),
    value = numeric(steps + 1)
  )
  for (step in seq_len(steps)){
    optim_step(x, y, lr, momentum_x, momentum_y, beta)
    x$grad$zero_()
    y$grad$zero_()
    trajectory$x[step] <- x$item()
    trajectory$y[step] <- y$item()
  }
  trajectory$x[steps + 1] <- x$item()
  trajectory$y[steps + 1] <- y$item()

  plot_rosenbrock() +
    geom_path(data = trajectory, aes(x = x, y = y, z = NULL), color = "red") +
    labs(title = "Optimization Path on Rosenbrock Function", x = "X-axis", y = "Y-axis")
}

Question 3: Weight Decay

In exercise 2, we have optimized the Rosenbrock function. Does it make sense to also use weight decay for this optimization?