--- title: "Automatic Differentiation with RTMB" output: html_vignette: toc: yes toc_depth: 3 author: "Andrew Johnson" date: "`r Sys.Date()`" vignette: > %\VignetteIndexEntry{Automatic Differentiation with RTMB} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) ``` ```{r setup} library(StanEstimators) ``` # Introduction Stan's algorithms, including MCMC sampling (NUTS), optimization (L-BFGS), variational inference, Pathfinder, and Laplace approximation, are gradient-based methods. This means they require not only the log-probability function but also its gradient (the vector of partial derivatives with respect to each parameter) to work efficiently. `StanEstimators` provides three ways to compute gradients: 1. **Finite differences** (default): Automatic but slow. Approximates gradients by evaluating the function at slightly perturbed parameter values. 2. **Analytical gradients**: Fast and accurate, but requires you to manually derive and code the gradient function. 3. **RTMB automatic differentiation**: Fast and automatic. Uses the `RTMB` package to compute exact gradients via automatic differentiation (AD). ## Installing RTMB To use RTMB with `StanEstimators`, you need to install the `RTMB`, `withr`, and `future` packages: ```{r, eval=FALSE} install.packages(c("RTMB", "withr", "future")) ``` Once installed, simply set `grad_fun = "RTMB"` in any `StanEstimators` function to enable automatic differentiation. For basic usage of `StanEstimators`, see the [Getting Started](Getting-Started.html) vignette. # Poisson Regression Next, we'll examine a generalized linear model (GLM) for count data. Poisson regression uses a log-link function: $\log(\lambda_i) = X_i\beta$, where $\lambda_i$ is the expected count. ## Simulating Data ```{r} set.seed(456) n <- 200 p <- 2 # number of predictors (plus intercept) # True coefficients true_beta <- c(0.5, 1.2, -0.8) # Design matrix (intercept + 2 predictors) X <- cbind(1, matrix(rnorm(n * p), n, p)) # Generate Poisson counts lambda <- exp(X %*% true_beta) y_pois <- rpois(n, lambda) ``` ## Defining the Log-Likelihood ```{r} poisson_loglik <- function(pars, y, X) { eta <- X %*% pars lambda <- exp(eta) sum(dpois(y, lambda, log = TRUE)) } ``` ## Performance Comparison ```{r, results=FALSE, message=FALSE, warning=FALSE} inits_pois <- rep(0, 3) # Finite differences timing_pois_fd <- system.time({ fit_pois_fd <- stan_sample(poisson_loglik, inits_pois, additional_args = list(y = y_pois, X = X), num_chains = 1, seed = 1234) }) # RTMB timing_pois_rtmb <- system.time({ fit_pois_rtmb <- stan_sample(poisson_loglik, inits_pois, grad_fun = "RTMB", additional_args = list(y = y_pois, X = X), num_chains = 1, seed = 1234) }) ``` ## Results ```{r} timing_results_pois <- data.frame( Method = c("Finite Differences", "RTMB"), Time_seconds = c(timing_pois_fd[3], timing_pois_rtmb[3]), Speedup = c(1, timing_pois_fd[3] / timing_pois_rtmb[3]) ) knitr::kable(timing_results_pois, digits = 2, caption = "Performance comparison for Poisson regression") ``` ```{r} summary(fit_pois_rtmb) ``` RTMB handles the matrix operations and log-link function automatically, providing improved performance (typically 8-10x speedup) while correctly recovering the true parameter values. # Logistic Regression Logistic regression models binary outcomes using a logit link: $\text{logit}(p_i) = X_i\beta$, where $p_i$ is the probability of success. ## Simulating Data ```{r} set.seed(789) n <- 300 p <- 2 # True coefficients true_beta_logit <- c(0.2, 1.5, -1.0) # Design matrix X_logit <- cbind(1, matrix(rnorm(n * p), n, p)) # Generate binary outcomes eta <- X_logit %*% true_beta_logit prob <- plogis(eta) # inverse logit y_binom <- rbinom(n, size = 1, prob = prob) ``` ## Defining the Log-Likelihood ```{r} logistic_loglik <- function(pars, y, X) { eta <- X %*% pars p <- plogis(eta) sum(dbinom(y, size = 1, prob = p, log = TRUE)) } ``` ## Performance Comparison ```{r, results=FALSE, message=FALSE, warning=FALSE} inits_logit <- rep(0, 3) # Finite differences timing_logit_fd <- system.time({ fit_logit_fd <- stan_sample(logistic_loglik, inits_logit, additional_args = list(y = y_binom, X = X_logit), num_chains = 1, seed = 1234) }) # RTMB timing_logit_rtmb <- system.time({ fit_logit_rtmb <- stan_sample(logistic_loglik, inits_logit, grad_fun = "RTMB", additional_args = list(y = y_binom, X = X_logit), num_chains = 1, seed = 1234) }) ``` ## Results ```{r} timing_results_logit <- data.frame( Method = c("Finite Differences", "RTMB"), Time_seconds = c(timing_logit_fd[3], timing_logit_rtmb[3]), Speedup = c(1, timing_logit_fd[3] / timing_logit_rtmb[3]) ) knitr::kable(timing_results_logit, digits = 2, caption = "Performance comparison for Logistic regression") ``` ```{r} summary(fit_logit_rtmb) ``` # Gaussian Mixture Model Mixture models represent complex latent structure and demonstrate RTMB's benefits for challenging models. We'll fit a two-component Gaussian mixture. The model is: $p(y) = \pi \cdot N(\mu_1, \sigma_1^2) + (1-\pi) \cdot N(\mu_2, \sigma_2^2)$ ## Simulating Data ```{r} set.seed(101) n <- 400 # True parameters true_pi <- 0.3 true_mu1 <- -2 true_mu2 <- 3 true_sigma1 <- 1 true_sigma2 <- 1.5 # Generate mixture data component <- rbinom(n, 1, true_pi) y_mix <- ifelse(component == 1, rnorm(n, true_mu1, true_sigma1), rnorm(n, true_mu2, true_sigma2)) ``` ## Defining the Log-Likelihood ```{r} mixture_loglik <- function(pars, y) { # Transform parameters to satisfy constraints pi <- pars[1] # mixing proportion in [0,1] mu1 <- pars[2] mu2 <- pars[3] sigma1 <- pars[4] # positive sigma2 <- pars[5] # positive # Log-likelihood for each component log_lik1 <- dnorm(y, mu1, sigma1, log = TRUE) + log(pi) log_lik2 <- dnorm(y, mu2, sigma2, log = TRUE) + log(1 - pi) sum(log(exp(log_lik1) + exp(log_lik2))) } ``` ## Performance Comparison ```{r, results=FALSE, message=FALSE, warning=FALSE} # Initialize near true values (mixture models can have multimodality) inits_mix <- c(0.3, -2, 3, 1, 1.5) # Finite differences timing_mix_fd <- system.time({ fit_mix_fd <- stan_sample(mixture_loglik, inits_mix, lower = c(0, -Inf, -Inf, 0, 0), upper = c(1, Inf, Inf, Inf, Inf), additional_args = list(y = y_mix), num_chains = 1, seed = 1234) }) # RTMB timing_mix_rtmb <- system.time({ fit_mix_rtmb <- stan_sample(mixture_loglik, inits_mix, lower = c(0, -Inf, -Inf, 0, 0), upper = c(1, Inf, Inf, Inf, Inf), grad_fun = "RTMB", additional_args = list(y = y_mix), num_chains = 1, seed = 1234) }) ``` ## Results ```{r} timing_results_mix <- data.frame( Method = c("Finite Differences", "RTMB"), Time_seconds = c(timing_mix_fd[3], timing_mix_rtmb[3]), Speedup = c(1, timing_mix_fd[3] / timing_mix_rtmb[3]) ) knitr::kable(timing_results_mix, digits = 2, caption = "Performance comparison for Gaussian Mixture") ``` ```{r} summary(fit_mix_rtmb) ``` # Time Series: AR(1) Model An autoregressive model of order 1 (AR(1)) captures temporal dependence: $y_t = \phi y_{t-1} + \epsilon_t$, where $|\phi| < 1$ for stationarity. ## Simulating Data ```{r} set.seed(202) n <- 200 true_phi <- 0.7 true_sigma <- 1 # Simulate AR(1) process y_ar <- numeric(n) y_ar[1] <- rnorm(1, 0, true_sigma / sqrt(1 - true_phi^2)) for (t in 2:n) { y_ar[t] <- true_phi * y_ar[t-1] + rnorm(1, 0, true_sigma) } ``` ## Defining the Log-Likelihood We use `tanh()` to constrain φ to (-1, 1). ```{r} ar1_loglik <- function(pars, y) { phi <- pars[1] # constrain to (-1, 1) sigma <-pars[2] # positive n <- length(y) # First observation from stationary distribution ll <- dnorm(y[1], 0, sigma / sqrt(1 - phi^2), log = TRUE) # Subsequent observations for (t in 2:n) { ll <- ll + dnorm(y[t], phi * y[t-1], sigma, log = TRUE) } ll } ``` ## Performance Comparison ```{r, results=FALSE, message=FALSE, warning=FALSE} inits_ar <- c(0.5, 1) # Finite differences timing_ar_fd <- system.time({ fit_ar_fd <- stan_sample(ar1_loglik, inits_ar, lower = c(-1, 0), upper = c(0, Inf), additional_args = list(y = y_ar), num_chains = 1, seed = 1234) }) # RTMB timing_ar_rtmb <- system.time({ fit_ar_rtmb <- stan_sample(ar1_loglik, inits_ar, lower = c(-1, 0), upper = c(0, Inf), grad_fun = "RTMB", additional_args = list(y = y_ar), num_chains = 1, seed = 1234) }) ``` ## Results ```{r} timing_results_ar <- data.frame( Method = c("Finite Differences", "RTMB"), Time_seconds = c(timing_ar_fd[3], timing_ar_rtmb[3]), Speedup = c(1, timing_ar_fd[3] / timing_ar_rtmb[3]) ) knitr::kable(timing_results_ar, digits = 2, caption = "Performance comparison for AR(1) model") ``` ```{r} summary(fit_ar_rtmb) ``` ## Quick Approximation with Pathfinder RTMB also works with Pathfinder, Stan's fast variational inference method: ```{r, results=FALSE, message=FALSE, warning=FALSE} fit_ar_path <- stan_pathfinder(ar1_loglik, inits_ar, grad_fun = "RTMB", additional_args = list(y = y_ar)) ``` ```{r} summary(fit_ar_path) ```