ST443 Machine Learning and data mining

Week 10


Philipp Sterzinger

p.sterzinger@lse.ac.uk

04 February 2026

Question 1

  1. SVC and SVM.
  1. In the class, we have shown that the SVC corresponds to the following optimisation problem: \[ \underset{(\beta_0, \beta, \xi) \in \Re \times \Re^p \times \Re^n_{\geq 0}}{\min} \, \frac{1}{2} \| \beta \|_2^2 + \rho \sum_{i = 1}^n \xi_i, \quad \text{s.t. } y_i (\beta_0 + \beta^\top x_i) \geq 1 - \xi_i, \, \forall i = 1, \ldots, n \,, \] where \(\rho \geq 0\) is a tuning cost hyperparameter. Explain how \(\rho\) controls the bias-variance trade-off of the SVC.

Question 1.a

Soft-margin SVC

\[ \underset{(\beta_0, \beta, \xi) \in \Re \times \Re^p \times \Re^n_{\geq 0}}{\min} \, \frac{1}{2} \| \beta \|_2^2 + \rho \sum_{i = 1}^n \xi_i, \quad \text{s.t. } y_i (\beta_0 + \beta^\top x_i) \geq 1 - \xi_i, \, \forall i = 1, \ldots, n \,, \]

  • \(\rho\) controls how much we penalise the objective for violations of the margin
  • Large \(\rho\), heavy penalisation, we strongly prefer no violations of margin, so margin will get smaller
  • Small margin means we make less mistakes (low bias), but we have fewer support vectors (higher variance)
  • Large margins means we allow for more mistakes (high bias), but have more support vectors (lower variance)

Question 1.a

Code
library(e1071)
library(plotly)

set.seed(443)

n_per <- 100
X_pos <- cbind(rnorm(n_per,  1.5, 1), rnorm(n_per,  1.5, 1))
X_neg <- cbind(rnorm(n_per, -1.5, 1), rnorm(n_per, -1.5, 1))

df <- rbind(
  data.frame(x1 = X_pos[, 1], x2 = X_pos[, 2], y = "1"),
  data.frame(x1 = X_neg[, 1], x2 = X_neg[, 2], y = "-1")
)
df$y <- factor(df$y, levels = c("-1", "1"))

pad <- 1
xlim <- range(df$x1) + c(-pad, pad)
ylim <- range(df$x2) + c(-pad, pad)

rho_grid <- c(0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20)

line_segment <- function(w, b, level, xlim, ylim, tol = 1e-9) {
  w1 <- w[1]; w2 <- w[2]
  pts <- matrix(numeric(0), ncol = 2)
  
  if (abs(w2) > tol) {
    y <- (level - b - w1 * xlim[1]) / w2
    pts <- rbind(pts, c(xlim[1], y))
    y <- (level - b - w1 * xlim[2]) / w2
    pts <- rbind(pts, c(xlim[2], y))
  }
  if (abs(w1) > tol) {
    x <- (level - b - w2 * ylim[1]) / w1
    pts <- rbind(pts, c(x, ylim[1]))
    x <- (level - b - w2 * ylim[2]) / w1
    pts <- rbind(pts, c(x, ylim[2]))
  }
  
  inside <- pts[,1] >= xlim[1] - 1e-7 & pts[,1] <= xlim[2] + 1e-7 &
    pts[,2] >= ylim[1] - 1e-7 & pts[,2] <= ylim[2] + 1e-7
  pts <- pts[inside, , drop = FALSE]
  
  if (nrow(pts) == 0) return(data.frame(x = numeric(0), y = numeric(0)))
  
  pts <- unique(round(pts, 8))
  
  if (nrow(pts) == 1) return(data.frame(x = pts[1,1], y = pts[1,2]))
  
  if (nrow(pts) > 2) {
    d <- as.matrix(dist(pts))
    ij <- which(d == max(d), arr.ind = TRUE)[1, ]
    pts <- pts[c(ij[1], ij[2]), , drop = FALSE]
  } else {
    pts <- pts[1:2, , drop = FALSE]
  }
  
  data.frame(x = pts[,1], y = pts[,2])
}

all_points <- list()
all_sv <- list()
all_lines <- list()

for (k in seq_along(rho_grid)) {
  rho <- rho_grid[k]
  
  fit <- svm(y ~ x1 + x2, data = df, type = "C-classification",
             kernel = "linear", cost = rho, scale = FALSE)
  
  w <- drop(t(fit$coefs) %*% fit$SV)
  b <- -fit$rho
  
  tmp_points <- df
  tmp_points$rho <- rho
  all_points[[k]] <- tmp_points
  
  sv_idx <- fit$index
  sv_df <- df[sv_idx, ]
  sv_df$rho <- rho
  all_sv[[k]] <- sv_df
  
  bound <- line_segment(w, b,  0, xlim, ylim)
  mpos  <- line_segment(w, b,  1, xlim, ylim)
  mneg  <- line_segment(w, b, -1, xlim, ylim)
  
  if (nrow(bound) > 0) bound$kind <- "boundary"
  if (nrow(mpos)  > 0) mpos$kind  <- "margin+"
  if (nrow(mneg)  > 0) mneg$kind  <- "margin-"
  
  lines_k <- rbind(bound, mpos, mneg)
  if (nrow(lines_k) > 0) {
    lines_k$rho <- rho
    all_lines[[k]] <- lines_k
  }
}

points_all <- do.call(rbind, all_points)
sv_all <- do.call(rbind, all_sv)
lines_all <- do.call(rbind, all_lines)

points_all$rho_f <- factor(points_all$rho, levels = rho_grid)
sv_all$rho_f <- factor(sv_all$rho, levels = rho_grid)
lines_all$rho_f <- factor(lines_all$rho, levels = rho_grid)
lines_all$group <- paste(lines_all$kind, lines_all$rho_f)

pts_pos <- subset(points_all, y == "1")
pts_neg <- subset(points_all, y == "-1")

p <- plot_ly()

p <- add_markers(
  p,
  data = pts_pos,
  x = ~x1,
  y = ~x2,
  frame = ~rho_f,
  marker = list(size = 7, color = "#1f77b4"),
  name = "y = 1",
  showlegend = TRUE
)

p <- add_markers(
  p,
  data = pts_neg,
  x = ~x1,
  y = ~x2,
  frame = ~rho_f,
  marker = list(size = 7, color = "#ff7f0e"),
  name = "y = -1",
  showlegend = TRUE
)

p <- add_markers(
  p,
  data = sv_all,
  x = ~x1,
  y = ~x2,
  frame = ~rho_f,
  marker = list(symbol = "circle-open", size = 16,
                color = "black", line = list(width = 2, color = "black")),
  inherit = FALSE,
  name = "support vectors",
  showlegend = TRUE
)

p <- add_lines(
  p,
  data = subset(lines_all, kind == "boundary"),
  x = ~x,
  y = ~y,
  frame = ~rho_f,
  split = ~group,
  line = list(color = "black", width = 4),
  inherit = FALSE,
  showlegend = FALSE
)

p <- add_lines(
  p,
  data = subset(lines_all, kind == "margin+"),
  x = ~x,
  y = ~y,
  frame = ~rho_f,
  split = ~group,
  line = list(color = "lightgray", width = 2, dash = "dash"),
  inherit = FALSE,
  showlegend = FALSE
)

p <- add_lines(
  p,
  data = subset(lines_all, kind == "margin-"),
  x = ~x,
  y = ~y,
  frame = ~rho_f,
  split = ~group,
  line = list(color = "lightgray", width = 2, dash = "dash"),
  inherit = FALSE,
  showlegend = FALSE
)

p <- layout(
  p,
  xaxis = list(title = "x1", range = xlim),
  yaxis = list(title = "x2", range = ylim, scaleanchor = "x", scaleratio = 1),
  title = "Soft-margin SVC"
)

p <- animation_opts(p, frame = 0, transition = 0, redraw = TRUE)
p <- suppressWarnings(animation_slider(p, currentvalue = list(prefix = "ρ = ")))
p$x$layout$updatemenus <- NULL
p

Question 1.b

  1. Let \(x = (x_1, x_2)^\top \in \Re^2\) and \(\Phi(x) = (x_1, x_2, x_1 x_2)^\top \in \Re^3\). Give the expression for the kernel \(\mathcal{k}(x, x')\) of \(\Phi\).

Show

  • Functional form of kernel

Know

  • \(\Phi(x) = (x_1, x_2, x_1 x_2)^\top\)

  • \(k(x,x') = \Phi(x)^\top \Phi(x')\)

Question 1.b.

\[ \begin{aligned} k(x,x') &= \Phi(x)^\top \Phi(x') \\ &= (x_1, x_2, x_1 x_2)^\top(x_1', x_2', x_1' x_2') \\ &= x_1x_1' + x_2x_2' + x_1x_1'x_2x_2' \,. \end{aligned} \]

Question 1.c

  1. Show that the function \(\mathcal{k}(x, x') = \cos(x - x')\) for \(x, x' \in \Re\) is a kernel for some \(\Phi\), and that \(\mathcal{k}\) is a symmetric and positive definite kernel.

Show

  • \(\cos(x - x')\) is a kernel for some feature map \(\Phi(x)\)

  • \(k(x,x')\) is symmetric and positive definite

Know

  • \(k(x,x') = \Phi(x)^\top \Phi(x')\)

Question 1.c

Symmetry, PSD

For any \(r \in \mathbb{N}\), \(x_1, \ldots, x_r \in \Re\),

\[ \mathcal K = \begin{bmatrix} \mathcal{k}(x_1, x_1) & \mathcal{k}(x_1, x_2) & \ldots & \mathcal{k}(x_1, x_r) \\ \mathcal{k}(x_2, x_1) & \mathcal{k}(x_2, x_2) & \ldots & \mathcal{k}(x_2, x_r) \\ \vdots & \ddots & \ddots & \vdots \\ \mathcal{k}(x_r, x_1) & \mathcal{k}(x_r, x_2) & \ldots & \mathcal{k}(x_r, x_r) \end{bmatrix} \]

is symmetric, and positive semidefinite

Question 1.c

Symmetry

  • Need \(\mathcal{k}(x_1, x_2) = \mathcal{k}(x_2, x_1) \iff \cos(x_1 - x_2) = \cos(x_2 - x_1)\)
Code
xs <- seq(from = -5, to = 5, length.out = 1000)

plot(xs, cos(xs), type = "l", col = "blue", lwd = 2) 
abline(v = 0, col = "grey", linetype = "dash", lty = 2)
  • \(\cos(x) = \cos(-x) = \cos(|x|)\)

  • Consequently \(\cos(x_1 - x_2) = \cos(-(x_1 - x_2)) = \cos(x_2 - x_1)\)

Question 1.c

Feature map

A Fundamental trigonometric identity [NE]:

Video: Cosine of the Difference of Two Angles

Video: Law of cosines

\[ \cos(x - x') = \cos(x) \cos(x') + \sin(x) \sin(x') \]

\[ \begin{aligned} \cos(x - x') &= \cos(x) \cos(x') + \sin(x) \sin(x') \\ &= \begin{bmatrix} \cos(x) & \sin(x) \end{bmatrix} \begin{bmatrix} \cos(x') \\ \sin(x') \end{bmatrix} \\ &= \Phi(x)^\top \Phi(x') \,, \end{aligned} \]

for

\[ \Phi(x) = \begin{bmatrix} \cos(x) \\ \sin(x) \end{bmatrix} \,. \]

Question 1.c

PSD

  • Need: For any \(r, \in \mathbb{N}\), \(x_1, \ldots, x_r \in \Re\), \(v \in \Re^r, v \neq 0\), \(v^\top \mathcal K v \geq 0\)

\[ \begin{aligned} \mathcal K &= \begin{bmatrix} \Phi(x_1)^\top \Phi(x_1) & \Phi(x_1)^\top \Phi(x_2) & \ldots & \Phi(x_1)^\top \Phi(x_r) \\ \Phi(x_2)^\top \Phi(x_1) & \Phi(x_2)^\top \Phi(x_2) & \ldots & \Phi(x_2)^\top \Phi(x_r) \\ \vdots & \ddots & \ddots & \vdots \\ \Phi(x_r)^\top \Phi(x_1) & \Phi(x_r)^\top \Phi(x_2) & \ldots &\Phi(x_r)^\top \Phi(x_r) \end{bmatrix} \\ &= \begin{bmatrix} \Phi(x_1)^\top \\ \Phi(x_2)^\top \\ \vdots \\ \Phi(x_r)^\top \end{bmatrix} \begin{bmatrix} \Phi(x_1) & \Phi(x_2) & \ldots & \Phi(x_r) \end{bmatrix} \end{aligned} \]

Question 1.c

PSD

  • Need: For any \(r, \in \mathbb{N}\), \(x_1, \ldots, x_r \in \Re\), \(v \in \Re^r\), \(v^\top K v \geq 0\)

\[ v^\top \mathcal K v= v^\top [\Phi(x_1), \ldots, \Phi(x_r)]^\top [\Phi(x_1), \ldots, \Phi(x_r)] v = \tilde v^\top \tilde v \geq 0 \,, \]

for \[ \tilde v = [\Phi(x_1), \ldots, \Phi(x_r)] v \,. \]

Question 1.d

  1. Consider the polynomial kernel \[ \mathcal{k}(x, x') = (c + \gamma x^\top x')^d \] for \(d = 2\), \(\gamma > 0\) and \(c \ge 0\), where \(x, x' \in \Re^p\). Show the feature map \(\Phi\) of this kernel.

Show

  • Feature map corresponding to \(\mathcal{k}(x,x') = (c + \gamma x^\top x')^2\)

Know

  • \(\mathcal{k}(x,x') = \Phi(x)^\top \Phi(x')\)

  • Form of kernel

Question 1.d

\[ \begin{aligned} \mathcal{k}(x, y) &= (c + \gamma x^\top y)^2 \\ &= c^2 + 2 c \gamma \sum_{i = 1}^p x_i y_i + \gamma^2 \left(\sum_{i = 1}^p x_i y_i \right)^2 \\ &= c^2 + 2 c \gamma \sum_{i = 1}^p x_i y_i + \gamma^2 \sum_{i = 1}^p\sum_{j = 1}^p x_ix_j y_i y_j \\ &= c^2 + 2 c \gamma \sum_{i = 1}^p x_i y_i + \gamma^2 \sum_{i = 1}^p x_i^2 y_i^2 + 2\gamma^2 \sum_{i = 1}^p\sum_{j > 1}^r x_ix_j y_i y_j \end{aligned} \]

Idea: match to second order polynomial with interactions \[ \begin{multline} \Psi(x) = \begin{bmatrix} x_1, x_2, \ldots, x_p, x_1^2, \ldots, x_p^2, x_1 x_2, \ldots, x_1 x_p, x_2 x_3, \ldots, x_{p-1} x_p \end{bmatrix} \end{multline} \]

Question 1.d

\[ \Phi(x)^\top \Phi(y) = c^2 + 2 c \gamma \sum_{i = 1}^p x_i y_i + \gamma^2 \sum_{i = 1}^p x_i^2 y_i^2 + 2\gamma^2 \sum_{i = 1}^p\sum_{j > 1}^p x_ix_j y_i y_j \]

vs

\[ \Psi(x)^\top \Psi(y) = \sum_{i = 1}^p x_i y_i + \sum_{i = 1}^p x_i^2 y_i^2 + 2\sum_{i=1}^p\sum_{j > 1}^p x_ix_j y_i y_j \]

Therefore

\[ \Phi(x) = \begin{bmatrix} c, \sqrt{2c \gamma} x_1, \ldots, \sqrt{2c \gamma} x_p, \gamma x_1^2 , \ldots,\gamma x_p^2, \gamma x_1 x_2, \ldots, \gamma x_{p-1} x_p \end{bmatrix} \]

Question 1.e

  1. Consider the SVM with a radial basis kernel, which takes the form \[ \mathcal{k}(x, x') = \exp\left(-\gamma \|x - x'\|_2^2\right), \] where \(\gamma > 0\). Explain how the parameter \(\gamma\) influences the bias and variance of the SVM classifier.

Show

  • Influence of \(\gamma\) on bias, variance

Know

  • SVC: \(\psi(x) = \textrm{sign}(\beta_0^* + x^\top \beta^*)\)

  • \(\beta^* = \sum_{i = 1}^n \lambda_i^* y_i x_i\)

  • \(\beta_0^* = -0.5 (\min_{y_i = 1} x_i^\top \beta^* + \min_{y_i = -1} x_i^\top \beta^*)\)

  • SVM: replace all inner products with kernel \(\mathcal{k}(x,x')\)

  • Discriminant based classifier: \(\psi(x) = \textrm{sign}(s(x) + \beta_0)\)

Question 1.e

SVC

\[ x^\top \beta^* = \sum_{i = 1}^p \lambda_i^* y_i x_i^\top x \]

SVM

\[ s(x) = \sum_{i = 1}^p \lambda_i^* y_i k(x_i, x) = \sum_{i = 1}^n \lambda_i^* y_i \exp \left(-\gamma \| x_i - x \|^2 \right) \]

  • If \(\gamma \to \infty\): \(k(x,x_i) \approx 0\) whenever \(x_i \neq x\)
    • Like \(1\)-NN, low bias
    • Can produce highly nonlinear decision boundaries
    • If data change, the decsion boundary will change drastically (high variance)
  • If \(\gamma \to 0\): \(k(x,x_i) \to 1\), so prediction function becomes constant
    • Low variance, high bias
    • Generally, for large gamma, more points around \(x\) will contribute to score, lower variance but bias increases

Learnings

  • How kernels relate to inner product
  • How differnt kernels and hyperparameters give different decision boundaries
  • Soft-margin SVC/SVM cost parameter

Question 2

SVM and XOR data. Let \(f ∶ \{0, 1\}^2 \to \{0, 1\}\) be the exclusive OR function, where $f (a,b) = 1 if (a,b) = (0, 1) or (a,b) = (1, 0), and \(f (a, b) = 0\) otherwise. Consider the XOR dataset \((x_1, y_1), \ldots , (x_4, x_4)\), defined such that

  • \(x_1 = (0, 0)^\top\),
  • \(x_2 = (0, 1)^\top\),
  • \(x_3 = (1, 0)^\top\), and
  • \(x_4 = (1, 1)^\top\),

and \(y_i = f (x_i)\) for \(i = 1,\ldots , 4\).

Question 2

Code
library(ggplot2)

df <- data.frame(
  x = c(0, 0, 1, 1),
  y = c(0, 1, 0, 1),
  col = c("black", "white", "white", "black")
)

ggplot(df, aes(x, y)) +
  geom_point(aes(fill = col), shape = 21, size = 5, colour = "black") +
  scale_fill_identity() +
  coord_fixed() +
  theme_minimal()

The XOR dataset is a classical example of a non-linearly separable dataset.

Question 2.a

  1. Show that \((x_1, y_1), \ldots , (x_4, y_4)\) is not a linearly separable dataset.

Show

  • Nonseparability by hyperplane

Know

  • Definition of separating hyperplane

Question 2

Affine \(1\)-dimensional hyperplane

For \(\beta_0 \in \Re\), \(\beta \in \Re^2, \, \beta \neq 0_2\),

\[ H(\beta_0, \beta) = \{x \in \Re^2: \beta_0 + x^\top \beta = 0 \} \]

Separating hyperplane

\(H(\beta_0, \beta)\) is a separating hyperplane, if for all \(y_i \in \{-1,1\}\),

\[ y_i(\beta_0 + x_i^\top \beta) \geq 0 \]

Question 2

WLOG:

\[ f(x_i) = \begin{cases} 1 & \text{if } \| x \|_1 = 1 \\ -1 & \text{otherwise} \end{cases} \]

  • \(f(x_1) = f([0,0]) = -1\)
  • \(f(x_2) = f([1,0]) = 1\)
  • \(f(x_3) = f([0,1]) = 1\)
  • \(f(x_4) = f([1,1]) = -1\)

Then have \[ \begin{aligned} -(\beta_0 + [0,0] \beta) &\geq 0 \\ \beta_0 + [1, 0] \beta & \geq 0 \\ \beta_0 + [0,1]\beta& \geq 0 \\ -(\beta_0 + [1,1]\beta) & \geq 0 \end{aligned} \]

Question 2

\[ \begin{aligned} -(\beta_0 + [0,0] \beta) &\geq 0 \\ \beta_0 + [1, 0] \beta & \geq 0 \\ \beta_0 + [0,1]\beta& \geq 0 \\ -(\beta_0 + [1,1]\beta) & \geq 0 \end{aligned} \]

Yields

\[ \begin{aligned} \beta_0 &\leq 0 \\ \beta_1 & \geq -\beta_0 \\ \beta_2 & \geq -\beta_0 \\ \beta_0 + \beta_1 + \beta_2 &\leq 0 \end{aligned} \]

But then

\[ \begin{aligned} \beta_0 + \beta_1 + \beta_2 \geq \beta_0 - 2 \beta_0 = - \beta_0 \geq 0 \,, \end{aligned} \]

and would need \(\beta_0 = \beta_1 = \beta_2 = 0\), and \(H(0, [0,0]^\top)\) is not a \(1\)D hyperplane.

Question 2

  1. Consider the SVM using the quadratic polynomial kernel \[ \mathcal{k}(x,x') = (c + \gamma x^\top x')^2 \] for \(\gamma > 0\), \(c \geq 0\). Show that in the transformed feature domain using this kernel, the observation points are linearly separable.

Show

  • Separability of XOR with polynomial kernel

Know

  • Feature map \(\Phi(x)\)

  • Definition of separating hyperplane

Question 2.b

\[ \Phi(x_1,x_2) = \begin{bmatrix} c, & \sqrt{2c\gamma} x_1, & \sqrt{2c\gamma} x_2, & \gamma^2 x_1^2, & \gamma^2 x_1^2, & \sqrt{2}\gamma x_1x_2 \end{bmatrix} \]

Quick and dirty

  • \(c = 0\)
  • \(\gamma = 1\)

\[ \Phi(x_1,x_2) = \begin{bmatrix} x_1^2, & x_1^2, & \sqrt{2}\gamma x_1x_2 \end{bmatrix} \]

  • \(\Phi(0,0) = [0,0,0]^\top\)
  • \(\Phi(1,0) = [1,0,0]^\top\)
  • \(\Phi(0,1) = [0,1,0]^\top\)
  • \(\Phi(1,1) = [1,1,\sqrt{2}]^\top\)

Question 2.a

\[ y_i ( \beta_0 + \beta^\top \Phi(x_i)) \geq 0 \]

\[ \begin{aligned} -(\beta_0 + [0,0,0]\beta) &\geq 0 \\ \beta_0 + [1,0,0]\beta &\geq 0 \\ \beta_0 + [0,1,0]\beta &\geq 0 \\ \beta_0 + [1,1,\sqrt{2}] \beta & \geq 0 \end{aligned} \]

\[ \begin{aligned} \beta_0 &\leq 0 \\ \beta_1 & \geq - \beta_0 \\ \beta_2 & \geq - \beta_0 \\ -(\beta_0 + \beta_1 + \beta_2 + \sqrt{2}\beta_3) &\geq 0 \\ \end{aligned} \]

Take

  • \(\beta_0 = -1/4\)
  • \(\beta_1 = \beta_2 = 1\)
  • \(\beta_3 = - (1 + \sqrt{2})\)

Question 2.a

A separating feature map

Code
library(ggplot2)

df <- data.frame(
  x1 = c(0, 1, 0, 1),
  x2 = c(0, 0, 1, 1),
  y  = factor(c(0, 1, 1, 0))
)

grid <- expand.grid(
  x1 = seq(0, 1, length.out = 200),
  x2 = seq(0, 1, length.out = 200)
)
grid$g <-  -.1/4 + grid$x1^2 + grid$x2^2 - (1 + sqrt(2)) * ( grid$x1 *  grid$x2)

ggplot() +
  geom_raster(data = grid, aes(x1, x2, fill = g > 0), alpha = 0.3) +
  geom_contour(data = grid, aes(x1, x2, z = g), breaks = 0, colour = "black") +
  geom_point(data = df, aes(x1, x2, colour = y), size = 3) +
  scale_fill_manual(values = c("grey", "white"), guide = "none") +
  scale_colour_manual(values = c("black", "lightgrey")) +
  coord_fixed() +
  theme_minimal()

Question 2.a

A separating feature map

Learnings

  • Separating hyperplanes & relation to SVMs
  • Get comfortable with different kernels