Visualising PCA and \(k\)-means clustering

ST443 Machine learning and data mining


Author
Affiliation

Philipp Sterzinger
p.sterzinger@lse.ac.uk

Department of Statistics
London School of Economics and Political Science (LSE)

Published

18 February 2026

In this note, we will look at the visual aspects of PCA and the \(k\)-means clustering algorithm. Both topics are very intuitive graphically once we are able to connect the mathematics to simple pictures. As always, non-examinable topics are marked by 🏔️.

PCA

Our starting point with PCA is unlabelled data \(x_1, \ldots, x_n \in \Re^p\):

R code
library(dplyr)
library(tibble)
library(plotly)

set.seed(443)

c3 <- function(a, b) c(a[2]*b[3] - a[3]*b[2],
                      a[3]*b[1] - a[1]*b[3],
                      a[1]*b[2] - a[2]*b[1])

u <- function(a) a / sqrt(sum(a^2))


cols <- c(data = "#00BFC4", proj = "#F8766D")
grid_col <- "rgba(0,0,0,0.08)"

n <- 50
A <- matrix(c(2.0, 1.0, 0.2,
              0.0, 0.9, 0.1,
              0.0, 0.0, 0.25), 3, 3, byrow = TRUE)
X <- matrix(rnorm(3*n), ncol = 3) %*% A
X <- scale(X, center = TRUE, scale = FALSE)
X <- as.matrix(X)

pc <- prcomp(X, center = FALSE, scale. = FALSE)
v1 <- pc$rotation[,1]
v2 <- pc$rotation[,2]
v3 <- pc$rotation[,3]

xr <- range(X[,1]); yr <- range(X[,2]); zr <- range(X[,3])
pad <- function(r) 0.15 * diff(r)
x_range <- c(xr[1]-pad(xr), xr[2]+pad(xr))
y_range <- c(yr[1]-pad(yr), yr[2]+pad(yr))
z_range <- c(zr[1]-pad(zr), zr[2]+pad(zr))
L <- max(sqrt(rowSums(X^2))) * 1.1

plane_grid <- function(e1, e2, L, m = 50) {
  s <- seq(-L, L, length.out = m)
  g <- expand.grid(s = s, t = s)
  P <- g$s %o% e1 + g$t %o% e2
  list(
    x = matrix(P[,1], nrow = m, ncol = m),
    y = matrix(P[,2], nrow = m, ncol = m),
    z = matrix(P[,3], nrow = m, ncol = m)
  )
}

proj_to_plane <- function(X, n) {
  a <- as.vector(X %*% n)
  X - a %o% n
}

seg_lines <- function(X, Xp, idx) {
  X  <- as.matrix(X)
  Xp <- as.matrix(Xp)
  idx <- idx[idx >= 1 & idx <= nrow(X)]
  M <- do.call(rbind, lapply(idx, function(i) rbind(Xp[i,], X[i,], c(NA, NA, NA))))
  tibble(x = M[,1], y = M[,2], z = M[,3])
}

scene_style <- list(
  xaxis = list(title = "x1", range = x_range, showbackground = TRUE, backgroundcolor = "white",
               gridcolor = grid_col, zeroline = TRUE, zerolinecolor = grid_col),
  yaxis = list(title = "x2", range = y_range, showbackground = TRUE, backgroundcolor = "white",
               gridcolor = grid_col, zeroline = TRUE, zerolinecolor = grid_col),
  zaxis = list(title = "x3", range = z_range, showbackground = TRUE, backgroundcolor = "white",
               gridcolor = grid_col, zeroline = TRUE, zerolinecolor = grid_col),
  aspectmode = "cube"
)

camera0 <- list(eye = list(x = 1.6, y = 1.6, z = 1.2))

dat <- tibble(x = X[,1], y = X[,2], z = X[,3])

plane <- plane_grid(v1, v2, L = L*0.9, m = 25)
Xp <- proj_to_plane(X, v3)
proj_dat <- tibble(x = Xp[,1], y = Xp[,2], z = Xp[,3])
segs <- seg_lines(X, Xp, idx = seq_len(nrow(X)))

vis_data <- c(TRUE, FALSE, FALSE, FALSE)
vis_pca  <- c(TRUE, TRUE,  TRUE,  TRUE)

grey_scale <- list(
  list(0, "rgba(120,120,120,0.35)"),
  list(1, "rgba(120,120,120,0.35)")
)

p <- plot_ly() |>
  add_markers(
    data = dat, x = ~x, y = ~y, z = ~z,
    marker = list(size = 4.5, color = cols["data"], opacity = 0.85),
    name = "data",
    visible = TRUE
  ) |>
  add_surface(
    x = plane$x, y = plane$y, z = plane$z,
    opacity = 0.20,
    showscale = FALSE,
    colorscale = grey_scale,
    showlegend = FALSE,
    hoverinfo = "skip",
    visible = FALSE
  ) |>
  add_trace(
    data = segs, x = ~x, y = ~y, z = ~z,
    type = "scatter3d", mode = "lines",
    line = list(color = "rgba(80,80,80,0.35)", width = 1),
    showlegend = FALSE,
    hoverinfo = "skip",
    visible = FALSE
  ) |>
  add_markers(
    data = proj_dat, x = ~x, y = ~y, z = ~z,
    marker = list(size = 4, color = cols["proj"], opacity = 0.75),
    name = "projection",
    visible = FALSE
  ) |>
  layout(
    title = "Original data",
    scene = scene_style,
    legend = list(x = 1.02, y = 1),
    margin = list(r = 140),
    template = "plotly_white",
    scene_camera = camera0,
    updatemenus = list(
      list(
        type = "buttons",
        direction = "left",
        x = 0, y = 1.10,
        xanchor = "left", yanchor = "top",
        buttons = list(
          list(
            label = "Original data",
            method = "update",
            args = list(list(visible = vis_data),
                        list(title = "Original data"))
          ),
          list(
            label = "Show PCA",
            method = "update",
            args = list(list(visible = vis_pca),
                        list(title = "PCA plane + orthogonal projections"))
          )
        )
      )
    )
  )

p

We would like to find a “good” lower dimensional representation \(z_1,\ldots, z_n \in \Re^d\), \(d < p\).

R code
#| echo: true
#| fig-align: center

n0 <- v3
e1 <- v1
e2 <- u(c3(n0, e1))

Xp <- proj_to_plane(X, n0)

plane0 <- plane_grid(e1, e2, L = L*0.9, m = 17)

idx_seg <- seq_len(min(70, nrow(X)))
segs0 <- seg_lines(X, Xp, idx = idx_seg)

normal0 <- tibble(x = c(0, n0[1]*L*0.9, NA),
                  y = c(0, n0[2]*L*0.9, NA),
                  z = c(0, n0[3]*L*0.9, NA))

p2_static <- plot_ly() |>
  add_trace(
    data = plane0, x = ~x, y = ~y, z = ~z,
    type = "scatter3d", mode = "markers",
    marker = list(size = 2, color = "rgba(70,70,70,0.14)"),
    showlegend = FALSE,
    hoverinfo = "skip"
  ) |>
  add_trace(
    data = normal0, x = ~x, y = ~y, z = ~z,
    type = "scatter3d", mode = "lines",
    line = list(color = "black", width = 4),
    name = "normal",
    hoverinfo = "skip"
  ) |>
  add_trace(
    data = segs0, x = ~x, y = ~y, z = ~z,
    type = "scatter3d", mode = "lines",
    line = list(color = "rgba(80,80,80,0.35)", width = 2),
    showlegend = FALSE,
    hoverinfo = "skip"
  ) |>
  add_markers(
    data = dat, x = ~x, y = ~y, z = ~z,
    marker = list(size = 4.5, color = cols["data"], opacity = 0.85),
    name = "data"
  ) |>
  add_markers(
    data = tibble(x = Xp[,1], y = Xp[,2], z = Xp[,3]),
    x = ~x, y = ~y, z = ~z,
    marker = list(size = 4, color = cols["proj"], opacity = 0.75),
    name = "projection"
  ) |>
  layout(
    title = "Fixed plane: orthogonal projections and reconstruction residuals",
    scene = scene_style,
    legend = list(x = 1.02, y = 1),
    margin = list(r = 120),
    template = "plotly_white",
    scene_camera = camera0
  )

p2_static
Warning: 'layout' objects don't have these attributes: 'scene_camera'
Valid attributes include:
'_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'boxmode', 'barmode', 'bargap', 'mapType'

We might want to do this to reduce noise or computational complexity before passing our data to a supervised ML classifier, or simply to find a lower-dimensional representation that we can visualise to understand patterns in our data.

Before we can cook up any dimension reduction algorithm, we first need to set some ground rules of what we want to achieve, or what we mean by a “good” lower-dimensional representation. There are several approaches, or sets of rules, that all give rise to PCA. We will explore the three most standard motivations. Have a look around and see which one you like best, and perhaps try to find the connection between them.

Minimal reconstruction error

The first pathway to PCA is based on data-recunstruction argument, that should be intuitive and allows us to draw parallels from the geometry of linear regression.

The idea here is based on a very simple principle:

If we replace each \(x_i \in \mathbb{R}^p\) by something living in a \(d\)–dimensional linear subspace (with \(d<p\)), then we should lose as little information as possible.

To make this precise we need to be careful about what we are allowed to do.
If we allowed completely arbitrary maps \(T:\mathbb{R}^p \to \mathbb{R}^p\), we could always take \(T(x)=x\) and get zero error — so we must restrict \(T(x)\) to live in a fixed low-dimensional linear subspace.

So: pick a \(d\)–dimensional subspace \(U \subset \mathbb{R}^p\). For each data point \(x_i\), we approximate it by a point \(\hat x_i \in U\). The natural loss is squared Euclidean error: \[ \sum_{i=1}^n \|x_i - \hat x_i\|_2^2. \]

Step 1: for a fixed subspace, the best reconstruction is orthogonal projection

Fix the subspace \(U\). Among all choices \(\hat x_i \in U\), the best one is the orthogonal projection of \(x_i\) onto \(U\): \[ \hat x_i = P_U x_i, \] where \(P_U\) is the orthogonal projector onto \(U\).

This is exactly the same geometry as least squares regression: the minimiser is found by dropping a perpendicular onto the model space. In particular, the residual \[ r_i := x_i - P_U x_i \] is orthogonal to \(U\), and Pythagoras tells us that any other choice \(\tilde x_i \in U\) gives a larger error: \[ \|x_i-\tilde x_i\|_2^2 = \|x_i-P_U x_i\|_2^2 + \|P_U x_i-\tilde x_i\|_2^2 \;\ge\; \|x_i-P_U x_i\|_2^2. \]

So once we choose the subspace, the “best reconstruction rule” is forced on us.

Step 2: choose the subspace that makes these projection errors as small as possible

Now the only thing left to optimise is the subspace itself: \[ U^\star \in \arg\min_{\substack{U \subset \mathbb{R}^p\\ \dim(U)=d}} \sum_{i=1}^n \|x_i - P_U x_i\|_2^2. \]

To work with this, represent \(U\) by an orthonormal basis. Let \(V \in \mathbb{R}^{p\times d}\) have orthonormal columns (so \(V^\top V = I_d\)). Then \[ P_U = VV^\top, \qquad \hat x_i = VV^\top x_i, \qquad \|x_i - \hat x_i\|_2^2 = \|(I - VV^\top)x_i\|_2^2. \]

So the optimisation becomes \[ V^\star \in \arg\min_{V^\top V = I_d}\ \sum_{i=1}^n \|(I - VV^\top)x_i\|_2^2. \]

If we stack the data into a centred matrix \(X \in \mathbb{R}^{n\times p}\) (row \(i\) is \(x_i^\top\)), this is the same as \[ V^\star \in \arg\min_{V^\top V = I_d}\ \|X - XVV^\top\|_F^2. \]

What this has to do with PCA

That minimisation problem is exactly PCA: the optimal \(V^\star\) is given by the top \(d\) eigenvectors of the empirical covariance matrix \[ \hat\Sigma = \frac{1}{n}X^\top X. \]

Interpretation: - encoding (dimension reduction): \(z_i = V^\top x_i \in \mathbb{R}^d\), - decoding (reconstruction): \(\hat x_i = V z_i = VV^\top x_i\), - and PCA chooses the basis \(V\) that makes the average reconstruction error as small as possible.

(If the original data are not centred, the “best” \(d\)–dimensional set is an affine subspace through the mean; in practice, you centre first and then apply the story above.)

Maximal variance directions

Eigenvalue decomposition

\(k\)-means clustering