ST303 Stochastic Simumlation

Class 3


Philipp Sterzinger

p.sterzinger@lse.ac.uk


Office hours: Thursday, COL 8.05



23 October 2025

Recap

Inverse transform sampling

Given random variable \(X\) with CDF \(F_X\), we want to find a transformation \(T\) of a uniform random variable \(U\) such that \(T(U) \overset{d}{=} X\).

\[ \begin{aligned} F_X(t) &= \Pr \left( X \leq t \right) \\ &= \Pr \left( T(U) \leq t \right) \\ &= \Pr \left(U \leq T^{-1}(t) \right) \\ &= T^{-1}(t) \end{aligned} \] So \(F_X\) is the inverse function of \(T\), or, equivalently, \(T(t) = F_X^{-1}(t)\).

Inverse transform sampling

Code
library(ggplot2) 
library(cowplot)

logistf <- function(x){1 / (1 + exp(-x))} 
inv_logistf <- function(y){log(y / (1 - y))}  
xs <- seq(from = -10, to = 10, length.out = 1000)
ys <- logistf(xs)

set.seed(123)
N <- 10^5
us <- runif(N) 
ls <- inv_logistf(us) 

ind <- 2 
u1 <- us[ind] 
l1 <- ls[ind]


min_u_ind <- which(abs(ys - u1) == min(abs(ys - u1)))
uliney <- rep(u1, length(xs))
ulinex <- seq(from = xs[min_u_ind], to = max(xs), length.out = length(xs))

llinex <- rep(l1, length(xs)) 
lliney <- seq(from = 0, to = u1, length.out = length(xs))


central_plot <- ggplot(data.frame(xs = xs, ys = ys, llinex = llinex, lliney = lliney, ulinex = ulinex, uliney = uliney), aes(x = xs, y = ys, group = 1)) +
    geom_line(size = 1.2) + 
    geom_line(aes(x = ulinex, y = uliney), linetype = "dashed", color = "grey", size = 1) +
    geom_line(aes(x = llinex, y = lliney), linetype = "dashed", color = "grey", size = 1) +
    geom_point(aes(x = 10, y = u1), shape = 1, color = "steelblue", size = 2) + 
    geom_point(aes(x = l1, y = 0.0), shape = 1, color = "tomato", size = 2) + 
    geom_point(aes(x = l1, y = u1), shape = 1, color = "black", size = 2) + 
    theme_minimal() +
    labs(title = "(2) Logistic CDF", ylab = "") + 
    theme(plot.margin = margin(l = 0, t = 0), 
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(), 
        axis.title.y = element_blank())

right_hist <- ggplot(data.frame(us = us), aes(x = us)) +
    geom_histogram(bins = 50, fill = "steelblue", color = "black") +
    theme_minimal() +
    labs(title = "(1) Uniform sample") +
    coord_flip() +
    theme(axis.title = element_blank(),   
    axis.text = element_blank(),    
    axis.ticks = element_blank(),   
    plot.margin = margin(r = 0, t = 0))

bottom_hist <- ggplot(data.frame(ls = ls), aes(x = ls)) +
  geom_histogram(bins = 50, fill = "tomato", color = "black") +
  theme_minimal() +
  scale_y_reverse() +
  labs(y = "(3) Logistic Sample") +
  theme(plot.margin = margin(t = 0, b = 0), 
    axis.title.y = element_text(size = 14,
      color = "black",
      angle = 90,                            
      hjust = 1,
      vjust = 0.5),    
    axis.text.y = element_blank(),  
    axis.title.x = element_blank(),  
    axis.ticks.y = element_blank())

final_plot <- ggdraw() +
  draw_plot(central_plot, x = 0.2, y = 0.2, width = 0.6, height = 0.6) + 
  draw_plot(right_hist, x = 0.8, y = 0.2, width = 0.2, height = 0.6) +     
  draw_plot(bottom_hist, x = 0.2, y = 0, width = 0.6, height = 0.2)    

final_plot

Algorithm

  • Given \(F_X\) compute the inverse \(F_X^{-1}\)
  • Draw \(U \sim \mathcal{U}(0,1)\)
  • Compute \(X = F_X^{-1}(U)\)

Inverse transform sampling

Code
library(dplyr)
library(ggplot2) 
library(cowplot)
library(animation)

logistf <- function(x){1 / (1 + exp(-x))} 
logistf_pdf <- function(x){exp(-x) / (1 + exp(-x))^2}
inv_logistf <- function(y){log(y / (1 - y))}
xs <- seq(from = -10, to = 10, length.out = 1000)
ys <- logistf(xs)
l_dens <- logistf_pdf(xs)
sample_sizes <- seq(50, 5000, by = 50)

animation_data <- lapply(sample_sizes, function(N) {
  set.seed(123)
  us <- runif(N)
  ls <- inv_logistf(us)
  
  ind <- sample(1:N,1)
  u1 <- us[ind]
  l1 <- ls[ind]
  
  min_u_ind <- which(abs(ys - u1) == min(abs(ys - u1)))
  uliney <- rep(u1, length(xs))
  ulinex <- seq(from = xs[min_u_ind], to = max(xs), length.out = length(xs))
  
  llinex <- rep(l1, length(xs))
  lliney <- seq(from = 0, to = u1, length.out = length(xs))
  
  data.frame(
    xs = xs,
    ys = ys,
    ulinex = ulinex,
    uliney = uliney,
    llinex = llinex,
    lliney = lliney,
    us_point = u1,
    ls_point = l1,
    N = N
  )
}) %>% bind_rows()

animation_data2 <- lapply(sample_sizes, function(N) {
  set.seed(123)
  us <- runif(N)
  ls <- inv_logistf(us)
  
  data.frame(
    us = us,
    ls = ls,
    N = N
  )
}) %>% bind_rows()

invisible(saveGIF({
    for(n in unique(animation_data$N)){
        data <- filter(animation_data, N == n)
        data2 <- filter(animation_data2, N == n)

        center_plot <- ggplot(data, aes(x = xs, y = ys)) +
            geom_line(linewidth = 1.2) +
            geom_line(aes(x = ulinex, y = uliney), linetype = "dashed", color = "grey", linewidth = 1) +
            geom_line(aes(x = llinex, y = lliney), linetype = "dashed", color = "grey", linewidth = 1) +
            geom_point(aes(x = max(xs), y = us_point), shape = 1, color = "steelblue", size = 2) +
            geom_point(aes(x = ls_point, y = 0), shape = 1, color = "tomato", size = 2) +
            geom_point(aes(x = ls_point, y = us_point), shape = 1, color = "black", size = 2) +
            labs(title = "(2) Logistic CDF", y = "") +
            theme_minimal() +
            theme(plot.margin = margin(r = 0, t = 0, b = 0, l = 0),
                axis.text.x = element_blank(),  
                axis.title.x = element_blank(),  
                axis.ticks.x = element_blank()) 
            
        right_hist <- ggplot(data2, aes(x = us)) +
            geom_histogram(aes(y = after_stat(density)), 
                bins = 30, 
                fill = "steelblue", 
                color = "black") +
            geom_line(data = data.frame(x = xs, y = rep(1, length(xs))), aes(x = x, y  = y), linetype = "dashed", color = "grey", linewidth = 1) + 
            theme_minimal() +
            scale_x_continuous(limits = c(0, 1)) +  
            scale_y_continuous(limits = c(0, 1.2)) +  
            labs(title = "(1) Uniform sample") +
            coord_flip() +
            theme(plot.margin = margin(r = 0, t = 0, b = 0, l = 0),
                axis.title = element_blank(),   
                axis.text = element_blank(),    
                axis.ticks = element_blank()) 

        bottom_hist <- ggplot(data2, aes(x = ls)) +
            geom_histogram(aes(y = after_stat(density)),
                bins = 30, 
                fill = "tomato", 
                color = "black") +
            geom_line(data = data.frame(x = xs, y = l_dens), aes(x = x, y  = y), linetype = "dashed", color = "grey", linewidth = 1) + 
            theme_minimal() +
            scale_x_continuous(limits = c(-10, 10)) +  
            scale_y_reverse(limits = c(.3, 0)) +
            labs(y = "(3) Logistic Sample") +
            theme(plot.margin = margin(r = 0, t = 0, b = 0, l = 0), 
                axis.title.y = element_text(size = 14,
                color = "black",
                angle = 90,                            
                hjust = 1,
                vjust = 0.5),    
                axis.text.y = element_blank(),  
                axis.title.x = element_blank(),  
                axis.ticks.y = element_blank()) 

        title_plot <- ggplot() + 
            labs(title = paste("Sample Size N =", n)) + 
            theme_void() + 
            theme(
                plot.title = element_text(
                size = 20,          
                face = "bold",      
                hjust = 0.5,        
                vjust = 0.),
                plot.margin = margin(r = 0, t = 0, b = 0, l = 0)
            )

        final_plot <- ggdraw() +
            draw_plot(title_plot, x = 0, y = 0.9, width = 1, height = 0.1) +
            draw_plot(center_plot, x = 0.2, y = 0.2, width = 0.6, height = 0.6) + 
            draw_plot(right_hist, x = 0.8, y = 0.2, width = 0.2, height = 0.6) +     
            draw_plot(bottom_hist, x = 0.2, y = 0, width = 0.6, height = 0.2)   

        print(final_plot)
    }

}, movie.name = "logist_sampling.gif", interval = .2, ani.width = 1000, ani.height = 1000))
Logistic Sampling

Questions

Question 1

Generate two samples of size \(1000\) from the \(\Gamma(2, 1)\) distribution as follows:

  • Sample use rgamma(1000, 2, 1)

  • Sampling using the ’manual’ method, i.e. by calculating: -log(runif(1000)) − log(runif(1000))

Use qqplot to compare them. Then change the first sample to one generated by rgamma(1000, 2.2, 1.1) and repeat without changing the second sample to see what happens.

Code
# sample size 
N <- 10^5
# seed 
set.seed(123)
# gamma sample  
gamma_sample <- rgamma(N, 2, 1)
# manual gamma sample 
u1 <- runif(N) 
u2 <- runif(N) 
manual_gamma_sample <- -log(u1 * u2) 
qqplot(gamma_sample, manual_gamma_sample, 
  main = expression(Gamma(2,1) ~ "Q-Q Plot"),
  col = "steelblue", 
  alpha = 0.5) 

gamma_quantiles <- function(p, shape = 2, rate = 1){
  qgamma(p, shape, rate = rate) 
}

qqline(gamma_sample, distribution = gamma_quantiles, col = "magenta") 

Code
wrong_gamma_sample <- rgamma(N, 2.2, 1.1)
qqplot(wrong_gamma_sample, gamma_sample, 
  main = expression(Gamma(2.1,1.1) ~ "Q-Q Plot"), 
  col = "steelblue", 
  alpha = 0.5) 

qqline(gamma_sample, distribution = gamma_quantiles, col = "magenta") 

Question 2

Use the inverse transform method to generate samples of random variables with the following densities:

  1. \(f(x) = \frac{3x^2}{8}, \quad 0 < x < 2\)

\[ \begin{aligned} F(z) &= \frac{3}{8} \int_0^z x^2 dx \\ &= \left[\frac{3}{8} \frac{x^3}{3} \right]_0^z \\ &= \frac{z^3}{8} - 0 \\ &= \frac{z^3}{8} \end{aligned} \] Thus for \(p \in [0,1]\), \[ F^{-1}(p) = 2 p^{1/3} \]

Question 2

Code
N <- 10^5
set.seed(123) 
U <- runif(N) 

inv_cdf <- function(p){
  2 * p^(1 / 3)
}

X <- inv_cdf(U)

xs <- seq(from = 0, to = 2, length.out = 1000) 
pdfs <- 3 / 8 * xs^2 

hist(X, freq = FALSE, col = "steelblue") 
lines(xs, pdfs, col = "tomato", lty = 1, lwd = 2)
legend("topleft", 
  col = c("steelblue", "tomato"), 
  legend = c("Empirical distribution", "pdf"), 
  lty = c(NA, 1), 
  pch = c(15, NA), 
  lwd = c(NA, 2))

Question 2

  1. \(f(x) = \frac{4x}{(1 + x^2)^3}, \quad x > 0\)

Let \(u = (1 + x^2)\), then \[ \begin{aligned} F(z) &= 4 \int_0^z \frac{x}{(1 + x^2)^3} dx \\ &= 2 \int_1^{1 + z^2} \frac{1}{u^3} du \\ &= \left[- u^{-2} \right]_1^{1 + z^2} \\ &= - \left(\frac{1}{(1 + z^2)^2} - 1 \right)\\ &= 1 - \frac{1}{(1 + z^2)^2} \end{aligned} \]

\(F^{-1}(p) = [(1-p)^{-1/2} - 1]^{1/2}\)

Question 2

Code
N <- 10^5
set.seed(123) 
U <- runif(N) 

inv_cdf <- function(p){
  sqrt(sqrt(1 / (1 - p)) - 1) 
}

X <- inv_cdf(U)

xs <- seq(from = 0, to = 15, length.out = 1000) 
pdfs <- 4 * xs / (1 + xs^2)^3

hist(X, freq = FALSE, col = "steelblue", ylim = c(0,1)) 
lines(xs, pdfs, col = "tomato", lty = 1, lwd = 2)
legend("topright", 
  col = c("steelblue", "tomato"), 
  legend = c("Empirical distribution", "pdf"), 
  lty = c(NA, 1), 
  pch = c(15, NA), 
  lwd = c(NA, 2))

Question 2

  1. \(f(x) = \frac{1}{\pi (1 + x^2)}\)

Recall \(\arctan(t) = \int_0^t \frac{1}{1 + x^2} dx\)

\[ \begin{aligned} F(z) &= \frac{1}{\pi} \int_{-\infty}^z \frac{1}{ 1 + x^2}dx \\ &= \frac{1}{\pi} \left(\int_{-\infty}^0 \frac{1}{1 + x^2}dx + \int_0^z \frac{1}{ 1 + x^2}dx \right) \\ &= \frac{1}{\pi} \left(0 - \lim_\limits{x \to -\infty} \arctan(x) + \arctan(z) \right) \\ &= \frac{1}{\pi} \left(\frac{\pi}{2} + \arctan(z) \right) \\ &= \frac{1}{2} + \frac{\arctan(z)}{\pi} \end{aligned} \]

\(F^{-1}(p) = \tan \left( \pi [p - 1/2] \right)\)

Question 2

Code
N <- 10^5
set.seed(123) 
U <- runif(N) 

inv_cdf <- function(p){
  tan(pi * (p - 0.5))
}

X <- inv_cdf(U)

xs <- seq(from = -inv_cdf(0.99), 
  to = inv_cdf(0.99), 
  length.out = 1000) 

pdfs <- 1 / (pi * (1 + xs^2))


plot(density(X[abs(X) < inv_cdf(0.99)]),
  col = "steelblue", 
  lwd = 2) 
lines(xs, pdfs, col = "tomato", lty = 2, lwd = 1)
legend("topright", 
  col = c("steelblue", "tomato"), 
  legend = c("Estimated pdf", "Theoretical pdf"), 
  lty = c(1, 2), 
  lwd = c(2, 1))

Question 2

  1. \(f(x) = x + \frac{1}{2}, \quad 0 < x < 1\)

\[ F(z) = \int_0^z x + \frac{1}{2} dx = \frac{1}{2} x (1 + x) \]

\[ p = -\frac{1}{2} \pm \sqrt{\frac{1}{4} + 2 p} \]

Code
ps <- seq(from = 0, to = 1, length.out = 1000) 

plot(ps, -.5 + sqrt(0.25 + 2 * ps), col = "steelblue", type = "l", lwd = 2, ylim = c(-2.6,1.6)) 
lines(ps, -.5 - sqrt(0.25 + 2 * ps), col = "tomato", type = "l", lwd = 2) 
legend("right",legend = c("+", "-"), col = c("steelblue", "tomato"), lty = c(1, 1), lwd = c(2, 2))

\(F^{-1}(p) = \sqrt{\frac{1}{4} + 2 p} -\frac{1}{2}\)

Question 2

Code
N <- 10^5
set.seed(123) 
U <- runif(N) 

inv_cdf <- function(p){
  sqrt(0.25 + 2 * p) - 0.5 
}

X <- inv_cdf(U)

xs <- seq(from = 0, 
  to = 1, 
  length.out = 1000) 

pdfs <- xs + 0.5 


hist(X, freq = FALSE, col = "steelblue", ylim = c(0,1.5)) 
lines(xs, pdfs, col = "tomato", lty = 1, lwd = 2)
legend("topleft", 
  col = c("steelblue", "tomato"), 
  legend = c("Empirical distribution", "Theoretical pdf"), 
  lwd = c(2, 2))

Question 2

  1. \(f(x) = \frac{c e^{-cx}}{1 - e^{-c}}\) for \(0<x<1\)

\[ F(z) =\frac{c}{1-e^{-c}} \int_0^z e^{-cx} dx =- \frac{1}{1-e^{-c}} \left[e^{-cx}\right]_0^z = \frac{1 - e^{-cx}}{1 -e^{-c}} \]

\[ F^{-1}(p) = - \frac{1}{c} \log(1 - (1 - e^{-c})p) \]

Question 2

Code
N <- 10^5
set.seed(123) 
U <- runif(N) 

inv_cdf <- function(p, c = 1){
  -log(1 - (1 - exp(-c)) * p) / c
}

X <- inv_cdf(U)

xs <- seq(from = 0, 
  to = 1, 
  length.out = 1000) 

pdfs <- exp(-xs) / (1 - exp(-1))  


hist(X, freq = FALSE, col = "steelblue", ylim = c(0,1.5)) 
lines(xs, pdfs, col = "tomato", lty = 1, lwd = 2)
legend("topright", 
  col = c("steelblue", "tomato"), 
  legend = c("Empirical distribution", "Theoretical pdf"), 
  lwd = c(2, 2))

Question 2

  1. \(f(x) = \frac{1}{\log(2) (x + 1)(x + 2)}\) for \(x > 0\)

\[ \frac{1}{(x + 1)(x + 2)} = \frac{A}{x + 1} + \frac{B}{x + 2} \]

\[ 1 = A(x + 2) + B (x + 1) = (A + B)x + (2A + B) \]

The last term on the right describes a line with intercept \(2A + B\) and slope \(1\).

The leftmost term is a line with intercept \(1\) and slope \(0\).

\[ \begin{aligned} A + B & = 0 \\ 2A + B &= 1 \end{aligned} \implies \begin{aligned} A &= 1 \\ B &= -1 \end{aligned} \] so that \[ \frac{1}{(x + 1)(x + 2)} = \frac{1}{x + 1} - \frac{1}{x + 2} \]

Question 2

\[ \begin{aligned} F(z) &= \frac{1}{\log(2)} \int_0^z\frac{1}{(x + 1)(x + 2)} dx \\ &= \frac{1}{\log(2)} \int_0^z\frac{1}{x + 1} - \frac{1}{x + 2} dx \\ &= \frac{1}{\log(2)} \left[\log(x + 1) - \log(x + 2) \right]_0^z \\ &= \frac{1}{\log(2)} \log \left(\frac{z + 1}{z + 2} \right) + 1 \end{aligned} \]

Question 2

\[ \begin{aligned} p = \frac{1}{\log(2)} \log \left(\frac{x + 1}{x + 2} \right) + 1 &\iff (p-1) \log(2) = \log \left(\frac{x + 1}{x + 2} \right) \\ & \iff \exp \{(p-1) \log(2) \} = \frac{x + 1}{x + 2} \\ & \iff 2^{p-1} (x + 2) = x + 1 \\ & \iff x (2^{p - 1} - 1) = 1 - 2^{p} \\ & \iff x = \frac{1 - 2^{p}}{2^{p-1} - 1} \end{aligned} \]

\[ F^{-1}(p) = \frac{1 - 2^{p}}{2^{p-1} - 1}\]

Question 2

Code
N <- 10^5
set.seed(123) 
U <- runif(N) 

inv_cdf <- function(p){
  (1 - 2^p) / (2^(p - 1) - 1) 
}

X <- inv_cdf(U)

xs <- seq(from = 0, 
  to = inv_cdf(0.99), 
  length.out = 1000) 

pdfs <- 1 / log(2) * (1 / (1 + xs) - 1 / (2 + xs))


hist(X[X < inv_cdf(0.99)], freq = FALSE, col = "steelblue", ylim = c(0,.1), xlim = c(0, 50)) 
lines(xs, pdfs, col = "tomato", lty = 1, lwd = 2)
legend("topright", 
  col = c("steelblue", "tomato"), 
  legend = c("Empirical distribution", "Theoretical pdf"), 
  lwd = c(2, 2))

Question 3

Generate a sample from the random variable with density \[ f (x) = (1 − x)(1 + 3x), \quad 0 < x < 1\,. \]

Hint: Find the distribution function and express the random variable as the minimum of two other independent ones.

CDF

\[ \begin{aligned} F(z) &= \int_0^z (1-x) (1 + 3x) dx \\ &= \int_0^z 1 + 2x - 3x^2 dx \\ &= \left[x + x^2 - x^3 \right]_0^z \\ &= z + z^2 - z^3 \end{aligned} \]

Question 3

Minimum of two independent random variables

Consider two independent random variables \(V_1, V_2\). Distribution of minimum \(\min \{V_1, V_2\}\) is \[ \begin{aligned} F_{\min \{V_1, V_2\}}(x) &= \Pr \left(\min \{V_1, V_2\} \leq x \right) \\ &= 1 - \Pr \left( \min \{V_1, V_2\} > x \right) \\ &= 1 - \Pr \left(V_1 > x \, \land V_2 > x \right) \\ &= 1 - \Pr \left(V_1 > x \right) \Pr \left( V_2 > x \right) \\ &= 1 - (1 - F_{V_1}(x)) (1 - F_{V_2}(x)) \end{aligned} \]

Original CDF

\[ \begin{aligned} F(x) &= x +x^2 - x^3 \\ &= 1 - (1 - x - x^2 + x^3) \\ &= 1 - (1 - x) (1 - x^2) \end{aligned} \]

Question 3

\(F_{V_1}(x)\)

If \(U_1 \sim \mathcal{U}(0,1)\), then \(V_1 = U_1\) has distribution \(F_{V_1}(x) = x\)

\(F_{V_2}(x)\)

Inverse transform sampling: If \(U_2 \sim \mathcal{U}(0,1)\), then \(V_2 = F_{V_2}^{-1}(U_2) = \sqrt{U_2}\) has CDF \(F_{V_2}\).

Hence, for independent uniform random variables \(U_1, U_2\), \(\min \{U_1, \sqrt{U_2} \}\) has the desired CDF \(F(x) = 1 - (1 - x)(1 - x^2)\).

Question 3

Code
N <- 10^5
set.seed(123) 
U_1 <- runif(N) 
U_2 <- sqrt(runif(N))

V <- pmin(U_1, U_2) 

xs <- seq(from = 0, 
  to = 1, 
  length.out = 1000) 

pdfs <- (1 - xs) * (1 + 3 * xs) 


hist(V, freq = FALSE, col = "steelblue") 
lines(xs, pdfs, col = "tomato", lty = 1, lwd = 2)
lines(density(V), , col = "magenta", lty = 2, lwd = 2)
legend("topright", 
  col = c("steelblue", "tomato", "magenta"), 
  legend = c("Empirical distribution", "Theoretical pdf", "Estimated pdf"), 
  lwd = c(2, 2, 2), 
  lty = c(1, 1, 2))