An Introduction to Phylogeography

Author

Paul Bastide

Published

October 17, 2025

Analysis of the West Nile Virus (WNV) data with the Brownian Motion

In this practical script, we will analysis the West Nile Virus (WNV) data from Pybus et al. 2012, assuming a (strict) Brownian Motion model of geographical spread, and a fixed phylogenetic tree.

This script uses R, with the following packages, that you can install with the command instal.packages:

library(ape)     # for phylogenetic tree manipulation
library(ggplot2) # for plotting results
library(ggExtra) # for ggMarginal bivariate plot

The Brownian Motion (BM)

The goal of this first section is to simulate the trajectory of a simple Brownian Motion (BM).

Univariate BM

The univariate BM with unit variance $(W_t)_{t \geq 0}$ is the continuous time process with stationary, independent increments that is such that:

$W_0 = 0$;
$W_t \mid W_s \sim \mathcal{N}(W_s, t-s)$ for any $t \geq s \geq 0$.

To simulate a trajectory of this process between $0$ ant $t_e$, we:

choose a regular grid on $[0,t_e]$ with step $dt = t_e / N$;
set $W_0 = 0$;
simulate increments $dW_n = W_{nt} - W_{(n-1)t} \sim \mathcal{N}(0, dt)$ for $1 \leq n \leq N$;
write $W_{n} = \sum_{i=1}^n dW_i$ for $1 \leq n \leq N$.

Justify this construction.

To simulate this process in R, we use the following code (to be completed).

Write a function simulate_bm(t_e,N) that simulates a BM on $N$ points of the interval $[0,t_e]$. The function should return a dataframe, with a column containing the time, and a column containing the corresponding simulated values.

simulate_bm <- function(t_e, N) {
  # 1. choose a regular grid on [0,T] with step dt = t_e / N
  dt <- t_e / N
  time <- seq(0, t_e, dt)
  # 2. define data frame and set W_0 = 0
  traj <- data.frame(time = time, value = rep(NA, N+1))
  [...]
  # 3. simulate increments dW_n \sim N(0, dt) for 1 <= n <= N
  [...]
  # 4. write W_n = \sum_{i=1}^n dW_i for 1 <= n <= N in the dataframe
  [...]
  # return dataframe
  return(traj)
}

simulate_bm <- function(t_e, N) {
  # 1. choose a regular grid on [0,T] with step dt = t_e / N
  dt <- t_e / N
  time <- seq(0, t_e, dt)
  # 2. define data frame and set W_0 = 0
  traj <- data.frame(time = time, value = rep(NA, N+1))
  traj$value[1] <- 0
  # 3. simulate increments dW_n ~ N(0, dt) for 1 <= n <= N
  dW <- rep(NA, N)
  for (n in 1:N) {
    dW[n] <- rnorm(1, mean = 0, sd = sqrt(dt))
  }
  # 4. write W_n = \sum_{i=1}^n dW_i for 1 <= n <= N in the dataframe
  traj$value[-1] <- cumsum(dW)
  # return dataframe
  return(traj)
}

We can then simulate and plot the trajectory.

t_e <- 10
N <- 1000
traj <- simulate_bm(t_e, N)
ggplot(traj, aes(x = time, y = value)) + geom_line()

How can we simulate a BM with a given variance $\sigma^2$, and starting at a given value $\mu$ ?

sigma2 <- 0.01
mu <- 10.3
traj <- simulate_bm(t_e, N)
traj$value <- mu + sqrt(sigma2) * traj$value
ggplot(traj, aes(x = time, y = value)) + geom_line()

What is the distribution of $W_{t_e}$ ? Build an histogram to show it empirically.

nrep <- 1000
W_end <- data.frame(it = 1:nrep, value = rep(NA, nrep))

for (i in 1:nrep) {
  ## Fill in value
}

## Plot
ggplot(W_end, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 50) +
  stat_function(fun = dnorm, args = c([...], [...]))

nrep <- 1000
W_end <- data.frame(it = 1:nrep, value = rep(NA, nrep))
for (i in 1:nrep) {
  W_end$value[i] <- simulate_bm(t_e, N)$value[N]
}
ggplot(W_end, aes(x = value)) + 
  geom_histogram(aes(y = after_stat(density)), bins = 50) +
  stat_function(fun = dnorm, args = c(0, sqrt(t_e)))

Bi-variate BM

The bi-variate BM with unit variance $(\mathbf{W}_t)_{t \geq 0}$ is the continuous time process with stationary, independent increments that is such that:

$\mathbf{W}_0 = \mathbf{0}$;
$\mathbf{W}_t \mid \mathbf{W}_s \sim \mathcal{N}(\mathbf{W}_s, (t-s) \mathbf{I}_2)$ for any $t \geq s \geq 0$.

Note that this process is simply made of two independent BM on the $x$ and $y$ axes.

Write a function simulate_bm_bi(t_e,N) that simulates a BM on $N$ points of the interval $[0,t_e]$. The function should return a dataframe, with a column containing the time, and a column containing the corresponding simulated x and y values.

simulate_bm_bi <- function(t_e, N) {
  [...]
  return(traj)
}

simulate_bm_bi <- function(t_e, N) {
  # simulate x axis
  traj_x <- simulate_bm(t_e, N)
  # simulate y axis
  traj_y <- simulate_bm(t_e, N)
  # merge
  traj <- merge(traj_x, traj_y, by = "time")
  return(traj)
}

We can then simulate and plot the trajectory.

traj <- simulate_bm_bi(t_e, N)
ggplot(traj, aes(x = value.x, y = value.y)) + geom_path()

How can we simulate a BM with a given variance $\Sigma$, and starting at a given value $\boldsymbol{\mu}$ ?

sigma2 <- matrix(c(0.55, 0.45, 0.45, 0.55), ncol = 2)
chol_sigma2 <- chol(sigma2)
stopifnot(isTRUE(all.equal(t(chol_sigma2) %*% chol_sigma2, sigma2)))
mu <- c(10.3, -2.3)
traj <- simulate_bm_bi(t_e, N)
traj[, c(2, 3)] <- sweep(as.matrix(traj[, c(2, 3)]) %*% chol_sigma2, 2, -mu)
ggplot(traj, aes(x = value.x, y = value.y)) + geom_path()

What is the distribution of $\mathbf{W}_{t_e}$ ? Build an histogram to show it empirically.

nrep <- 1000
W_end <- data.frame(it = 1:nrep, value = rep(NA, nrep))

for (i in 1:nrep) {
  ## Fill in value
}

## Plot
ggplot(W_end, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 50) +
  stat_function(fun = dnorm, args = c([...], [...]))

nrep <- 1000
W_end <- data.frame(it = 1:nrep, value.x = rep(NA, nrep), value.y = rep(NA, nrep))
for (i in 1:nrep) {
  traj <- simulate_bm_bi(t_e, N)
  traj[, c(2, 3)] <- sweep(as.matrix(traj[, c(2, 3)]) %*% chol_sigma2, 2, -mu)
  W_end[i, c(2, 3)] <- traj[N, c(2, 3)]
}
datEllipse <- ellipse::ellipse(t_e * sigma2, center = mu, level = 0.95)
p <- 
  ggplot(W_end, aes(x = value.x, y = value.y)) +
  geom_point() + 
  geom_point(data = data.frame(x = mu[1], y = mu[2]), aes(x = x, y = y), size = 4, col = "red") + 
  geom_path(aes(x = x, y = y), data = datEllipse, linewidth = 2, linetype = 2, col = "red") + 
  coord_fixed(ratio = 1) +
  theme_bw()
ggMarginal(p, type = "histogram")

BM on a tree

We can then simulate a BM on a phylogenetic tree.

We first need to simulate a tree using function rphylo from the ape package:

set.seed(1899)              # set seed for reproducibility
tree <- rphylo(5, 0.5, 0.1) # simulate a tree
tree                        # show tree object


Phylogenetic tree with 5 tips and 4 internal nodes.

Tip labels:
  t1, t2, t3, t4, t5

Rooted; includes branch length(s).

The phylo object contains all the information needed about the tree (see help(phylo)), including branch lengths, which we take as given in units of time. We can re-order the tree so that its edges are numbered from the root to the tips (in a preorder).

tree <- reorder(tree)
plot(tree)
edgelabels()
nodelabels()
tiplabels()

The tree$edge field is a table with 2 columns and as many rows as there are edges. Each line gives the number of the parent node and the child node of the edge.

Give the parent node and child node of edge number 5. Find it on the tree.

Edge number 3 is the third row of tree$edge:

tree$edge[5, ]

[1] 7 5

Its parent node is:

tree$edge[5, 1]

[1] 7

and its child node is:

tree$edge[5, 2]

[1] 5

Find the parent node on the tree of node number 3. Find it on the tree.

To find the parent node of node number 3, we need to find it in the second column of tree$edge:

which(tree$edge[, 2] == 3) # find the edge that ends at node 3

[1] 3

The parent of node 3 is then given by the first column of this edge:

tree$edge[tree$edge[, 2] == 3, 1] # find the parent node of node 3

[1] 9

Find the edge that is just before edge number 5 on the tree. What about the edge that is just before edge number 1 ? Find them on the tree.

To find the edge that is just before edge number 3 on the tree, we need to find the edge that ends where edge 3 begins:

which(tree$edge[5, 1] == tree$edge[, 2])

[1] 1

We do the same for edge 1 :

which(tree$edge[1, 1] == tree$edge[, 2])

integer(0)

and find that this edge has no parent edge, as it starts at the root of the tree.

What is the length of edge number 3 ?

The length of edge number 3 is given by:

tree$edge.length[3]

[1] 0.1571309

To simulate a BM on the tree, we need to traverse the branches of the tree in a preorder. On a given branch, we can simulate a BM in time. When the branch ends at a given node, it splits into two branches, and we can start two new independent BM, starting at the ending value of the current branch.

Simulate a univariate BM on the tree generated above by traversing the edges of the tree.

## choose a time step
dt <- 0.001
## choose starting value
mu <- 1.3
## Create a data frame with the simulated trajectory
dftree <- data.frame(value = numeric(0),       # position of the BM
                     time = numeric(0),        # time elapsed since root
                     edge = numeric(0))        # number of the edge of the tree
## traverse the tree from edge to edge
for (i in 1:nrow(tree$edge)) {
  # simulate a BM starting at (0,0) on the current branch
  t_branch <- tree$edge.length[i]
  N_branch <- floor(t_branch / dt)
  current_BM <- [...]
  # find the parent edge of the starting node of the current edge
  parent_edge <-[...]
  # case where current edge does not a parent edge
  if (length(parent_edge) == 0) {
    # add starting value mu
    current_BM$value <- [...]s
  } else { # case where current edge do have a parent edge
    # find last value on the parent branch
    dfparent <- dftree[dftree$edge == parent_edge, ]
    last_parent <- dfparent[nrow(dfparent), ]
    # add last value of parent as starting value of the branch
    current_BM$value <- [...]
    current_BM$time <- [...]
  }
  # fill in the dataframe
  dfbranch <-  data.frame(value = current_BM$value,
                          time = current_BM$time,
                          edge = i)
  # update dftree
  dftree <- rbind(dftree, dfbranch)
}

## choose a time step
dt <- 0.001
## choose starting value
mu <- 1.3
## Create a data frame with the simulated trajectory
dftree <- data.frame(value = numeric(0),       # position of the BM
                     time = numeric(0),        # time elapsed since root
                     edge = numeric(0))        # number of the edge of the tree
## traverse the tree from edge to edge
for (i in 1:nrow(tree$edge)) {
  # simulate a BM starting at (0,0) on the current branch
  t_branch <- tree$edge.length[i]
  N_branch <- floor(t_branch / dt)
  current_BM <- simulate_bm(t_branch, N_branch)
  # find the parent edge of the starting node of the current edge
  parent_edge <- which(tree$edge[i, 1] == tree$edge[, 2])
  # case where current edge does not a parent edge
  if (length(parent_edge) == 0) {
    # add starting value mu
    current_BM$value <- current_BM$value + mu
  } else { # case where current edge do have a parent edge
    # find last value on the parent branch
    dfparent <- dftree[dftree$edge == parent_edge, ]
    last_parent <- dfparent[nrow(dfparent), ]
    # add last value of parent as starting value of the branch
    current_BM$value <- current_BM$value + last_parent$value
    current_BM$time <- current_BM$time + last_parent$time
  }
  # fill in the dataframe
  dfbranch <-  data.frame(value = current_BM$value,
                          time = current_BM$time,
                          edge = i)
  # update dftree
  dftree <- rbind(dftree, dfbranch)
}

We can then plot the univariate BM in time.

p <- ggplot(dftree, aes(time, value, color = as.factor(edge))) +
  geom_path(linewidth = 1)
p

Compare the simulation with the tree structure.

We can then simulate a bi-variate BM on the tree.

Using a similar recursion on the tree, simulate a bivariate BM on the tree generated above.

## choose a time step
dt <- 0.001
## choose starting value
mu <- c(1.3, -2.4)
## Create a data frame with the simulated trajectory
dftree <- data.frame(value.x = numeric(0),     # x position of the BM
                     value.y = numeric(0),     # y position of the BM
                     time = numeric(0),        # time elapsed since root
                     edge = numeric(0))        # number of the edge of the tree
## traverse the tree from edge to edge
for (i in 1:nrow(tree$edge)) {
  [...]
}

## choose a time step
dt <- 0.001
## choose starting value
mu <- c(1.3, -2.4)
## Create a data frame with the simulated trajectory
dftree <- data.frame(value.x = numeric(0),     # x position of the BM
                     value.y = numeric(0),     # y position of the BM
                     time = numeric(0),        # time elapsed since root
                     edge = numeric(0))        # number of the edge of the tree
## traverse the tree from edge to edge
for (i in 1:nrow(tree$edge)) {
  # simulate a BM starting at (0,0) on the current branch
  t_branch <- tree$edge.length[i]
  N_branch <- floor(t_branch / dt)
  current_BM <- simulate_bm_bi(t_branch, N_branch)
  # find the parent edge of the starting node of the current edge
  parent_edge <- which(tree$edge[i, 1] == tree$edge[, 2])
  # case where current edge does not a parent edge
  if (length(parent_edge) == 0) {
    # add starting value mu
    current_BM$value.x <- current_BM$value.x + mu[1]
    current_BM$value.y <- current_BM$value.y + mu[2]
  } else { # case where current edge do have a parent edge
    # find last value on the parent branch
    dfparent <- dftree[dftree$edge == parent_edge, ]
    last_parent <- dfparent[nrow(dfparent), ]
    # add last value of parent as starting value of the branch
    current_BM$value.x <- current_BM$value.x + last_parent$value.x
    current_BM$value.y <- current_BM$value.y + last_parent$value.y
    current_BM$time <- current_BM$time + last_parent$time
  }
  # fill in the dataframe
  dfbranch <-  data.frame(value.x = current_BM$value.x,
                          value.y = current_BM$value.y,
                          time = current_BM$time,
                          edge = i)
  # update dftree
  dftree <- rbind(dftree, dfbranch)
}

p <- ggplot(dftree, aes(value.x, value.y, color = as.factor(edge))) +
  geom_path(linewidth = 1) + 
  coord_fixed(ratio = 1)
p

Analysis of the WNV dataset

Loading the data

We use the data available in this GitHub repository.

We assume here that a phylogenetic tree of the virus has already been inferred, based on genetic data. It can be loaded in R using function read.tree from package ape.

tree <- read.tree("https://raw.githubusercontent.com/pbastide/cauchy_paper/refs/heads/main/data/Pybus_et_al_2012_Data/WNV_GTR_gamma_MCC.newick")
tree


Phylogenetic tree with 104 tips and 103 internal nodes.

Tip labels:
  AF202541_Hs_1999.66, FJ151394_Cb_1999, DQ164188_Cb_2003, AF404756_Cb_2000, DQ164194_Cb_2001, AF404754_Cp_2000, ...

Rooted; includes branch length(s).

Each tip of the tree represents a variant of the virus sampled in one individual. The tree has time information in years, so that it can be dated, as in the plot below.

plot(tree, show.tip.label = FALSE)
axisPhylo(root.time = 1998.394, backward = FALSE)

We then need to know the geographical localisation of the virus when they were sampled. These are recorded as latitude and longitude information in the following table:

dat <- read.table("https://raw.githubusercontent.com/pbastide/cauchy_paper/refs/heads/main/data/Pybus_et_al_2012_Data/WNV_lat_long.txt",
                  header = TRUE)
head(dat)

            traits   lat    long
1 WG007_Hs_2005.59 31.82 -106.56
2 WG009_Hs_2005.67 32.28 -106.74
3 WG011_Hs_2006.66 31.78 -106.50
4 WG013_Hs_2007.48 31.84 -106.53
5 WG024_Hs_2003.53 33.81 -117.83
6 WG080_Hs_2004.56 34.22 -118.44

The names in dat$traits should match the tip labels of the tree in tree$tip.label. Using function match, re-order the lines of dat to make sure they are in the same as the tip labels. Then plot the tree and data using function phydataplot.

## Match data and tree
match(tree$tip.label, dat$traits)
[...]

## Plot tree and data
plot(tree, show.tip.label = FALSE, x.lim = 25)
phydataplot(dat[, "lat", drop = F], tree, offset = 1, scaling = 0.1)
phydataplot(dat[, "long", drop = F], tree, offset = 15, scaling = 0.05)

## Match data and tree
dat <- dat[match(tree$tip.label, dat$traits), ]
rownames(dat) <- dat$traits

## Plot tree and data
plot(tree, show.tip.label = FALSE, x.lim = 25)
phydataplot(dat[, "lat", drop = F], tree, offset = 1, scaling = 0.1)
phydataplot(dat[, "long", drop = F], tree, offset = 15, scaling = 0.05)

The two traits correspond to latitude and longitude. To plot these points on a map, we can use packages sf and rnaturalearth.

library("sf")

Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE

library("rnaturalearth")
north_america <- ne_countries(scale = "medium",
                              returnclass = "sf",
                              continent = "north america")
ggplot(data = north_america) +
  geom_sf() + 
  coord_sf(xlim = c(-140, -60), ylim = c(20, 55), expand = FALSE) + 
  geom_point(data = dat, aes(x = long, y = lat))

This map represents the position of the sampled viruses, at all times. Our question are:

where did the virus emerge for the first time ?
how did it spread on the map ?

Root position

To find the most probable root position, we fit a multivariate BM to the data.

Recall that, using the BM, the matrix of data has the matrix normal distribution: \[ \mathbf{Y} \sim \mathcal{MN}_{n,2}(\mathbf{1}_n\boldsymbol{\mu}^T, \mathbf{V}, \mathbf{\Sigma}) \] with $\boldsymbol{\mu}$ the vector of the root position, $\mathbf{\Sigma}$ the variance matrix of the BM, and $\mathbf{V}$ the matrix given by the tree.

The maximum likelihood formulas for the parameters are then: \[ \hat{\boldsymbol{\mu}}^T = (\mathbf{1}_n^T \mathbf{V}^{-1} \mathbf{1}_n)^{-1} \mathbf{1}_n^T \mathbf{V}^{-1} \mathbf{Y} \] \[ \hat{\mathbf{\Sigma}} = \frac{1}{n-1} (\mathbf{Y} - \mathbf{1}_n\hat{\boldsymbol{\mu}}^T)^T \mathbf{V}^{-1} (\mathbf{Y} - \mathbf{1}_n\hat{\boldsymbol{\mu}}^T) \]

The tree matrix $\mathbf{V}$ can be obtained from ape function vcv:

V <- vcv(tree)
V[1:3,1:3]

                    AF202541_Hs_1999.66 FJ151394_Cb_1999 DQ164188_Cb_2003
AF202541_Hs_1999.66            1.266477        0.0000000        0.0000000
FJ151394_Cb_1999               0.000000        1.0295882        0.4775296
DQ164188_Cb_2003               0.000000        0.4775296        4.8759063

What are the interpretations of the coefficients of $\mathbf{V}$ ? On this tree, can you find the year of the root ? What does this mean for the epidemic ?

$V_{ij}$ is the time between the root and the most recent ancestor of tips $i$ and $j$. On the diagonal, $V_{ii}$ is the time between the root and the tip $i$. Because there are dates associated with each tip, we can find the year of the root with a subtraction:

diag(V)[c(1, 8, 12)]

AF202541_Hs_1999.66 AY289214_Hs_2002.62 AF196835_Pc_1999.71 
           1.266477            4.226477            1.316477

1999.66 - diag(V)[1]

AF202541_Hs_1999.66 
           1998.394

2002.62 - diag(V)[8]

AY289214_Hs_2002.62 
           1998.394

1999.71 - diag(V)[12]

AF196835_Pc_1999.71 
           1998.394

This means that the epidemic first appear in the US in year $1998.394$.

(Bonus: what does the .394 means ?)

Implement the formulas above to find $\hat{\boldsymbol{\mu}}$ and $\hat{\mathbf{\Sigma}}$.

## data
Y <- as.matrix(dat[, c(2, 3)])
ndata <- nrow(Y)
ones = rep(1, ndata)

Vinv <- [...]

## mu hat
mu_hat <- [...]

## Sigma hat
sigma_hat <- [...]

## data
Y <- as.matrix(dat[, c(2, 3)])
ndata <- nrow(Y)
ones = rep(1, ndata)

Vinv <- solve(V)

## mu hat
mu_hat <- solve(t(ones) %*% Vinv %*% ones) %*% t(ones) %*% Vinv %*% Y

## Sigma hat
sigma_hat <- 1/(ndata-1) * t(Y - ones %*% mu_hat) %*% Vinv %*% (Y - ones %*% mu_hat)

Plot the inferred root position on the map. Where did the virus emerge in the US ? Does this make sense ?

## data
ggplot(data = north_america) +
  geom_sf() + 
  coord_sf(xlim = c(-140, -60), ylim = c(20, 55), expand = FALSE) + 
  geom_point(data = dat, aes(x = long, y = lat)) +
  geom_point(data = mu_hat, aes(x = long, y = lat), col = "red")

Ancestral state reconstruction

Writing $\mathbf{Z}$ the matrix of states at ancestral nodes, remind the formula: \[ \begin{pmatrix} \mathbf{Z}_{\cdot,1}\\ \mathbf{Z}_{\cdot,2}\\ \mathbf{Y}_{\cdot,1}\\ \mathbf{Y}_{\cdot,2} \end{pmatrix} \sim \mathcal{N}\left( \begin{pmatrix} \mu_1\mathbf{1}_m\\ \mu_2\mathbf{1}_m\\ \mu_1\mathbf{1}_n\\ \mu_2\mathbf{1}_n \end{pmatrix} , \begin{pmatrix} \Sigma_{11}\mathbf{V}_{ZZ} & \Sigma_{12}\mathbf{V}_{ZZ} & \Sigma_{11}\mathbf{V}_{ZY} & \Sigma_{12}\mathbf{V}_{ZY}\\ \Sigma_{21}\mathbf{V}_{ZZ} & \Sigma_{22}\mathbf{V}_{ZZ} & \Sigma_{21}\mathbf{V}_{ZY} & \Sigma_{22}\mathbf{V}_{ZY}\\ \Sigma_{11}\mathbf{V}_{YZ} & \Sigma_{12}\mathbf{V}_{YZ} & \Sigma_{11}\mathbf{V}_{YY} & \Sigma_{12}\mathbf{V}_{YY}\\ \Sigma_{21}\mathbf{V}_{YZ} & \Sigma_{22}\mathbf{V}_{YZ} & \Sigma_{21}\mathbf{V}_{YY} & \Sigma_{22}\mathbf{V}_{YY} \end{pmatrix} \right) \]

In the formula above, we need the full $\mathbf{V}$ matrix, for both the $\mathbf{Z}$ and the $\mathbf{Y}$.

We first label all the nodes in the tree:

tree <- makeNodeLabel(tree)
tree


Phylogenetic tree with 104 tips and 103 internal nodes.

Tip labels:
  AF202541_Hs_1999.66, FJ151394_Cb_1999, DQ164188_Cb_2003, AF404756_Cb_2000, DQ164194_Cb_2001, AF404754_Cp_2000, ...
Node labels:
  Node1, Node2, Node3, Node4, Node5, Node6, ...

Rooted; includes branch length(s).

Then we compute the full matrix. This time, we construct it naively by hand.

## Get the number of tips and internal nodes
ntips <- length(tree$tip.label)
nnodes <- length(tree$node.label)
## Contruct the void matrix
Vfull <- matrix(NA, ncol =  nnodes + ntips, nrow = nnodes + ntips)
## Name it with node first, then tips
tipandnodeslabels <- c(tree$node.label, tree$tip.label)
colnames(Vfull) <- rownames(Vfull) <- tipandnodeslabels
## get all ancestors for each pair of node / tip
all_ancestors <- mrca(tree, full = TRUE)
# note: this has tips first, then nodes, so that we re-order it first
all_ancestors_nodes_tips <- matrix(NA, ncol =  nnodes + ntips, nrow = nnodes + ntips)
all_ancestors_nodes_tips[1:nnodes, 1:nnodes] <- all_ancestors[ntips+1:nnodes, ntips+1:nnodes]
all_ancestors_nodes_tips[1:nnodes, nnodes+1:ntips] <- all_ancestors[ntips+1:nnodes, 1:ntips]
all_ancestors_nodes_tips[nnodes+1:ntips, 1:nnodes] <- all_ancestors[1:ntips, ntips+1:nnodes]
all_ancestors_nodes_tips[nnodes+1:ntips, nnodes+1:ntips] <- all_ancestors[1:ntips, 1:ntips]
## Fill it
for (i in 1:(nnodes + ntips)) {
  for (j in 1:(nnodes + ntips)) {
    ancestor_ij <- all_ancestors_nodes_tips[i,j]      # ancestor of i and j
    t_ij <- node.depth.edgelength(tree)[ancestor_ij]  # time between root and ancestor
    Vfull[i,j] <- t_ij                                # fill matrix
  }
}
## separate between nodes and tips
VZZ <- Vfull[1:nnodes, 1:nnodes]
VZY <- Vfull[1:nnodes, nnodes+1:ntips]
VYY <- Vfull[nnodes+1:ntips, nnodes+1:ntips]

We can check that the sub-matrix with the tips is the same as the matrix we previously had:

all(VYY == V)

[1] TRUE

Using the values of $\boldsymbol{\mu}$ and $\mathbf{\Sigma}$ estimated in the previous section, compute the expectation and mean of the full vector $(\mathbf{Z}, \mathbf{Y})$. You can use the Kronecker product as presented below.

## Kronecker product
A <- matrix(1:4, ncol = 2)
B <- matrix(10, ncol = 3, nrow = 3)
A

     [,1] [,2]
[1,]    1    3
[2,]    2    4

     [,1] [,2] [,3]
[1,]   10   10   10
[2,]   10   10   10
[3,]   10   10   10

kronecker(A, B)

     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]   10   10   10   30   30   30
[2,]   10   10   10   30   30   30
[3,]   10   10   10   30   30   30
[4,]   20   20   20   40   40   40
[5,]   20   20   20   40   40   40
[6,]   20   20   20   40   40   40

mZ <- kronecker(t(mu_hat), rep(1, nnodes))
mY <- kronecker(t(mu_hat), rep(1, ntips))
SZZ <- kronecker(sigma_hat, VZZ)
SZY <- kronecker(sigma_hat, VZY)
SYY <- kronecker(sigma_hat, VYY)

For the conditional distribution of the ancestral states knowing the tip positions, remind that we have: \[ \mathbf{Z} \mid \{\mathbf{Y} = \mathbf{y}\} \sim \mathcal{N}\left( \bar{\mathbf{m}} , \bar{\mathbf{S}} \right) \]

\[ \bar{\mathbf{m}} = \mathbf{m}_Z + \mathbf{S}_{ZY} \mathbf{S}_{YY}^{-1} (\mathbf{y} - \mathbf{m}_Y) \quad \bar{\mathbf{S}} = \mathbf{S}_{ZZ} - \mathbf{S}_{ZY} \mathbf{S}_{YY}^{-1}\mathbf{S}_{YZ} \]

For each node $i$, compute the conditional mean of the ancestral position at this node.

## full conditionals
bar_m <- [...]

## re-organize to find the marginals
cond_mean_Z <- [...]

## full conditionals
bar_m <- mZ + SZY %*% solve(SYY) %*% (as.vector(Y) - mY)

## re-organize to find the marginals
cond_mean_Z <- matrix(bar_m, ncol = 2, byrow = FALSE)

Plot in Evolaps

We use evolaps to plot the reconstructe states dynamically.

We first need to export the data using the correct format. This requires package treeio:

if (!requireNamespace("BiocManager", quietly=TRUE))
    install.packages("BiocManager")
BiocManager::install("treeio")

Then record the annotated tree in your local repository:

rec_table <- list(node = seq_len(ntips + nnodes))

## Positions
rec_table[["location1"]] <- c(dat[, "lat"], cond_mean_Z[, 1])
rec_table[["location2"]] <- c(dat[, "long"], cond_mean_Z[, 2])
## Reconstructions
rec_table[["location1_median"]] <- c(rep(NA, ntips), cond_mean_Z[, 1])
rec_table[["location2_median"]] <- c(rep(NA, ntips), cond_mean_Z[, 2])


## treedata
rec_table <- treeio::as_tibble(rec_table)
tree_tibble <- treeio::as_tibble(tree)
tree_data <- treeio::full_join(tree_tibble, rec_table, by = 'node')
tree_data <- treeio::as.treedata(tree_data)

treeio::write.beast(tree_data, file = "tree_MCC.tree", tree.name = "TREE1")

Finally, read the tree in Evolaps.

Ancestral state reconstruction - variances

For each node $i$, compute the conditional variance of the ancestral position at this node.

## full conditionals
bar_S <- [...]

## re-organize to find the marginals
cond_var_Z <- list()
for (i in 1:nnodes) {
  cond_var_Z[[i]] <- [...]
}

## full conditionals
bar_S <- SZZ - SZY %*% solve(SYY) %*% t(SZY)

## re-organize to find the marginals
cond_var_Z <- list()
for (i in 1:nnodes) {
  cond_var_Z[[i]] <- matrix(c(bar_S[i,i], bar_S[i, nnodes+i],
                              bar_S[nnodes+i, i], bar_S[nnodes+i, nnodes+i]),
                            ncol = 2)
}

For each node $i$, compute the confidence ellipse using package ellipse.

level <- 0.80

conf_loc <- list()
for (i in 1:nnodes) {
  conf_loc[[i]] <- ellipse::ellipse([...])
}

level <- 0.80

conf_loc <- list()
for (i in 1:nnodes) {
  conf_loc[[i]] <- ellipse::ellipse(cond_var_Z[[i]],
                                    center = cond_mean_Z[i, ],
                                    level = level)
}

Then record the annotated tree in your local repository:

rec_table <- list(node = seq_len(ntips + nnodes))

## Positions
rec_table[["location1"]] <- c(dat[, "lat"], cond_mean_Z[, 1])
rec_table[["location2"]] <- c(dat[, "long"], cond_mean_Z[, 2])
## Reconstructions
rec_table[["location1_median"]] <- c(rep(NA, ntips), cond_mean_Z[, 1])
rec_table[["location2_median"]] <- c(rep(NA, ntips), cond_mean_Z[, 2])
## ellipses
trait_name_lat <- paste0("location1_", level * 100, "%HPD")
trait_name_long <- paste0("location2_", level * 100, "%HPD")
rec_table[[paste0(trait_name_lat, "_", 1)]] <- c(rep(NA, ntips), lapply(conf_loc, function(cc) cc[, 1]))
rec_table[[paste0(trait_name_long, "_", 1)]] <- c(rep(NA, ntips), lapply(conf_loc, function(cc) cc[, 2]))

## treedata
rec_table <- treeio::as_tibble(rec_table)
tree_tibble <- treeio::as_tibble(tree)
tree_data <- treeio::full_join(tree_tibble, rec_table, by = 'node')
tree_data <- treeio::as.treedata(tree_data)

treeio::write.beast(tree_data, file = "tree_MCC.tree", tree.name = "TREE1")

Finally, read the tree in Evolaps. Are these confidence regions satisfactory ?