library(ape) # for phylogenetic tree manipulation
library(ggplot2) # for plotting results
library(ggExtra) # for ggMarginal bivariate plot
An Introduction to Phylogeography
Analysis of the West Nile Virus (WNV) data with the Brownian Motion
In this practical script, we will analysis the West Nile Virus (WNV) data from Pybus et al. 2012, assuming a (strict) Brownian Motion model of geographical spread, and a fixed phylogenetic tree.
This script uses R
, with the following packages, that you can install with the command instal.packages
:
The Brownian Motion (BM)
The goal of this first section is to simulate the trajectory of a simple Brownian Motion (BM).
Univariate BM
The univariate BM with unit variance \((W_t)_{t \geq 0}\) is the continuous time process with stationary, independent increments that is such that:
- \(W_0 = 0\);
- \(W_t \mid W_s \sim \mathcal{N}(W_s, t-s)\) for any \(t \geq s \geq 0\).
To simulate a trajectory of this process between \(0\) ant \(t_e\), we:
- choose a regular grid on \([0,t_e]\) with step \(dt = t_e / N\);
- set \(W_0 = 0\);
- simulate increments \(dW_n = W_{nt} - W_{(n-1)t} \sim \mathcal{N}(0, dt)\) for \(1 \leq n \leq N\);
- write \(W_{n} = \sum_{i=1}^n dW_i\) for \(1 \leq n \leq N\).
Justify this construction.
To simulate this process in R
, we use the following code (to be completed).
Write a function simulate_bm(t_e,N)
that simulates a BM on \(N\) points of the interval \([0,t_e]\). The function should return a dataframe, with a column containing the time, and a column containing the corresponding simulated values.
<- function(t_e, N) {
simulate_bm # 1. choose a regular grid on [0,T] with step dt = t_e / N
<- t_e / N
dt <- seq(0, t_e, dt)
time # 2. define data frame and set W_0 = 0
<- data.frame(time = time, value = rep(NA, N+1))
traj
[...]# 3. simulate increments dW_n \sim N(0, dt) for 1 <= n <= N
[...]# 4. write W_n = \sum_{i=1}^n dW_i for 1 <= n <= N in the dataframe
[...]# return dataframe
return(traj)
}
<- function(t_e, N) {
simulate_bm # 1. choose a regular grid on [0,T] with step dt = t_e / N
<- t_e / N
dt <- seq(0, t_e, dt)
time # 2. define data frame and set W_0 = 0
<- data.frame(time = time, value = rep(NA, N+1))
traj $value[1] <- 0
traj# 3. simulate increments dW_n ~ N(0, dt) for 1 <= n <= N
<- rep(NA, N)
dW for (n in 1:N) {
<- rnorm(1, mean = 0, sd = sqrt(dt))
dW[n]
}# 4. write W_n = \sum_{i=1}^n dW_i for 1 <= n <= N in the dataframe
$value[-1] <- cumsum(dW)
traj# return dataframe
return(traj)
}
We can then simulate and plot the trajectory.
<- 10
t_e <- 1000
N <- simulate_bm(t_e, N)
traj ggplot(traj, aes(x = time, y = value)) + geom_line()
How can we simulate a BM with a given variance \(\sigma^2\), and starting at a given value \(\mu\) ?
<- 0.01
sigma2 <- 10.3
mu <- simulate_bm(t_e, N)
traj $value <- mu + sqrt(sigma2) * traj$value
trajggplot(traj, aes(x = time, y = value)) + geom_line()
What is the distribution of \(W_{t_e}\) ? Build an histogram to show it empirically.
<- 1000
nrep <- data.frame(it = 1:nrep, value = rep(NA, nrep))
W_end
for (i in 1:nrep) {
## Fill in value
}
## Plot
ggplot(W_end, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 50) +
stat_function(fun = dnorm, args = c([...], [...]))
<- 1000
nrep <- data.frame(it = 1:nrep, value = rep(NA, nrep))
W_end for (i in 1:nrep) {
$value[i] <- simulate_bm(t_e, N)$value[N]
W_end
}ggplot(W_end, aes(x = value)) +
geom_histogram(aes(y = after_stat(density)), bins = 50) +
stat_function(fun = dnorm, args = c(0, sqrt(t_e)))
Bi-variate BM
The bi-variate BM with unit variance \((\mathbf{W}_t)_{t \geq 0}\) is the continuous time process with stationary, independent increments that is such that:
- \(\mathbf{W}_0 = \mathbf{0}\);
- \(\mathbf{W}_t \mid \mathbf{W}_s \sim \mathcal{N}(\mathbf{W}_s, (t-s) \mathbf{I}_2)\) for any \(t \geq s \geq 0\).
Note that this process is simply made of two independent BM on the \(x\) and \(y\) axes.
Write a function simulate_bm_bi(t_e,N)
that simulates a BM on \(N\) points of the interval \([0,t_e]\). The function should return a dataframe, with a column containing the time, and a column containing the corresponding simulated x and y values.
<- function(t_e, N) {
simulate_bm_bi
[...]return(traj)
}
<- function(t_e, N) {
simulate_bm_bi # simulate x axis
<- simulate_bm(t_e, N)
traj_x # simulate y axis
<- simulate_bm(t_e, N)
traj_y # merge
<- merge(traj_x, traj_y, by = "time")
traj return(traj)
}
We can then simulate and plot the trajectory.
<- simulate_bm_bi(t_e, N)
traj ggplot(traj, aes(x = value.x, y = value.y)) + geom_path()
How can we simulate a BM with a given variance \(\Sigma\), and starting at a given value \(\boldsymbol{\mu}\) ?
<- matrix(c(0.55, 0.45, 0.45, 0.55), ncol = 2)
sigma2 <- chol(sigma2)
chol_sigma2 stopifnot(isTRUE(all.equal(t(chol_sigma2) %*% chol_sigma2, sigma2)))
<- c(10.3, -2.3)
mu <- simulate_bm_bi(t_e, N)
traj c(2, 3)] <- sweep(as.matrix(traj[, c(2, 3)]) %*% chol_sigma2, 2, -mu)
traj[, ggplot(traj, aes(x = value.x, y = value.y)) + geom_path()
What is the distribution of \(\mathbf{W}_{t_e}\) ? Build an histogram to show it empirically.
<- 1000
nrep <- data.frame(it = 1:nrep, value = rep(NA, nrep))
W_end
for (i in 1:nrep) {
## Fill in value
}
## Plot
ggplot(W_end, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 50) +
stat_function(fun = dnorm, args = c([...], [...]))
<- 1000
nrep <- data.frame(it = 1:nrep, value.x = rep(NA, nrep), value.y = rep(NA, nrep))
W_end for (i in 1:nrep) {
<- simulate_bm_bi(t_e, N)
traj c(2, 3)] <- sweep(as.matrix(traj[, c(2, 3)]) %*% chol_sigma2, 2, -mu)
traj[, c(2, 3)] <- traj[N, c(2, 3)]
W_end[i,
}<- ellipse::ellipse(t_e * sigma2, center = mu, level = 0.95)
datEllipse <-
p ggplot(W_end, aes(x = value.x, y = value.y)) +
geom_point() +
geom_point(data = data.frame(x = mu[1], y = mu[2]), aes(x = x, y = y), size = 4, col = "red") +
geom_path(aes(x = x, y = y), data = datEllipse, linewidth = 2, linetype = 2, col = "red") +
coord_fixed(ratio = 1) +
theme_bw()
ggMarginal(p, type = "histogram")
BM on a tree
We can then simulate a BM on a phylogenetic tree.
We first need to simulate a tree using function rphylo
from the ape
package:
set.seed(1899) # set seed for reproducibility
<- rphylo(5, 0.5, 0.1) # simulate a tree
tree # show tree object tree
Phylogenetic tree with 5 tips and 4 internal nodes.
Tip labels:
t1, t2, t3, t4, t5
Rooted; includes branch length(s).
The phylo
object contains all the information needed about the tree (see help(phylo)
), including branch lengths, which we take as given in units of time. We can re-order the tree so that its edges are numbered from the root to the tips (in a preorder).
<- reorder(tree)
tree plot(tree)
edgelabels()
nodelabels()
tiplabels()
The tree$edge
field is a table with 2 columns and as many rows as there are edges. Each line gives the number of the parent node and the child node of the edge.
Give the parent node and child node of edge number 5
. Find it on the tree.
Edge number 3
is the third row of tree$edge
:
$edge[5, ] tree
[1] 7 5
Its parent node is:
$edge[5, 1] tree
[1] 7
and its child node is:
$edge[5, 2] tree
[1] 5
Find the parent node on the tree of node number 3
. Find it on the tree.
To find the parent node of node number 3
, we need to find it in the second column of tree$edge
:
which(tree$edge[, 2] == 3) # find the edge that ends at node 3
[1] 3
The parent of node 3
is then given by the first column of this edge:
$edge[tree$edge[, 2] == 3, 1] # find the parent node of node 3 tree
[1] 9
Find the edge that is just before edge number 5
on the tree. What about the edge that is just before edge number 1
? Find them on the tree.
To find the edge that is just before edge number 3
on the tree, we need to find the edge that ends where edge 3
begins:
which(tree$edge[5, 1] == tree$edge[, 2])
[1] 1
We do the same for edge 1
:
which(tree$edge[1, 1] == tree$edge[, 2])
integer(0)
and find that this edge has no parent edge, as it starts at the root of the tree.
What is the length of edge number 3
?
The length of edge number 3
is given by:
$edge.length[3] tree
[1] 0.1571309
To simulate a BM on the tree, we need to traverse the branches of the tree in a preorder. On a given branch, we can simulate a BM in time. When the branch ends at a given node, it splits into two branches, and we can start two new independent BM, starting at the ending value of the current branch.
Simulate a univariate BM on the tree
generated above by traversing the edges of the tree.
## choose a time step
<- 0.001
dt ## choose starting value
<- 1.3
mu ## Create a data frame with the simulated trajectory
<- data.frame(value = numeric(0), # position of the BM
dftree time = numeric(0), # time elapsed since root
edge = numeric(0)) # number of the edge of the tree
## traverse the tree from edge to edge
for (i in 1:nrow(tree$edge)) {
# simulate a BM starting at (0,0) on the current branch
<- tree$edge.length[i]
t_branch <- floor(t_branch / dt)
N_branch <- [...]
current_BM # find the parent edge of the starting node of the current edge
<-[...]
parent_edge # case where current edge does not a parent edge
if (length(parent_edge) == 0) {
# add starting value mu
$value <- [...]s
current_BMelse { # case where current edge do have a parent edge
} # find last value on the parent branch
<- dftree[dftree$edge == parent_edge, ]
dfparent <- dfparent[nrow(dfparent), ]
last_parent # add last value of parent as starting value of the branch
$value <- [...]
current_BM$time <- [...]
current_BM
}# fill in the dataframe
<- data.frame(value = current_BM$value,
dfbranch time = current_BM$time,
edge = i)
# update dftree
<- rbind(dftree, dfbranch)
dftree }
## choose a time step
<- 0.001
dt ## choose starting value
<- 1.3
mu ## Create a data frame with the simulated trajectory
<- data.frame(value = numeric(0), # position of the BM
dftree time = numeric(0), # time elapsed since root
edge = numeric(0)) # number of the edge of the tree
## traverse the tree from edge to edge
for (i in 1:nrow(tree$edge)) {
# simulate a BM starting at (0,0) on the current branch
<- tree$edge.length[i]
t_branch <- floor(t_branch / dt)
N_branch <- simulate_bm(t_branch, N_branch)
current_BM # find the parent edge of the starting node of the current edge
<- which(tree$edge[i, 1] == tree$edge[, 2])
parent_edge # case where current edge does not a parent edge
if (length(parent_edge) == 0) {
# add starting value mu
$value <- current_BM$value + mu
current_BMelse { # case where current edge do have a parent edge
} # find last value on the parent branch
<- dftree[dftree$edge == parent_edge, ]
dfparent <- dfparent[nrow(dfparent), ]
last_parent # add last value of parent as starting value of the branch
$value <- current_BM$value + last_parent$value
current_BM$time <- current_BM$time + last_parent$time
current_BM
}# fill in the dataframe
<- data.frame(value = current_BM$value,
dfbranch time = current_BM$time,
edge = i)
# update dftree
<- rbind(dftree, dfbranch)
dftree }
We can then plot the univariate BM in time.
<- ggplot(dftree, aes(time, value, color = as.factor(edge))) +
p geom_path(linewidth = 1)
p
Compare the simulation with the tree structure.
We can then simulate a bi-variate BM on the tree.
Using a similar recursion on the tree, simulate a bivariate BM on the tree
generated above.
## choose a time step
<- 0.001
dt ## choose starting value
<- c(1.3, -2.4)
mu ## Create a data frame with the simulated trajectory
<- data.frame(value.x = numeric(0), # x position of the BM
dftree value.y = numeric(0), # y position of the BM
time = numeric(0), # time elapsed since root
edge = numeric(0)) # number of the edge of the tree
## traverse the tree from edge to edge
for (i in 1:nrow(tree$edge)) {
[...] }
## choose a time step
<- 0.001
dt ## choose starting value
<- c(1.3, -2.4)
mu ## Create a data frame with the simulated trajectory
<- data.frame(value.x = numeric(0), # x position of the BM
dftree value.y = numeric(0), # y position of the BM
time = numeric(0), # time elapsed since root
edge = numeric(0)) # number of the edge of the tree
## traverse the tree from edge to edge
for (i in 1:nrow(tree$edge)) {
# simulate a BM starting at (0,0) on the current branch
<- tree$edge.length[i]
t_branch <- floor(t_branch / dt)
N_branch <- simulate_bm_bi(t_branch, N_branch)
current_BM # find the parent edge of the starting node of the current edge
<- which(tree$edge[i, 1] == tree$edge[, 2])
parent_edge # case where current edge does not a parent edge
if (length(parent_edge) == 0) {
# add starting value mu
$value.x <- current_BM$value.x + mu[1]
current_BM$value.y <- current_BM$value.y + mu[2]
current_BMelse { # case where current edge do have a parent edge
} # find last value on the parent branch
<- dftree[dftree$edge == parent_edge, ]
dfparent <- dfparent[nrow(dfparent), ]
last_parent # add last value of parent as starting value of the branch
$value.x <- current_BM$value.x + last_parent$value.x
current_BM$value.y <- current_BM$value.y + last_parent$value.y
current_BM$time <- current_BM$time + last_parent$time
current_BM
}# fill in the dataframe
<- data.frame(value.x = current_BM$value.x,
dfbranch value.y = current_BM$value.y,
time = current_BM$time,
edge = i)
# update dftree
<- rbind(dftree, dfbranch)
dftree }
<- ggplot(dftree, aes(value.x, value.y, color = as.factor(edge))) +
p geom_path(linewidth = 1) +
coord_fixed(ratio = 1)
p
Analysis of the WNV dataset
Loading the data
We use the data available in this GitHub repository.
We assume here that a phylogenetic tree of the virus has already been inferred, based on genetic data. It can be loaded in R
using function read.tree
from package ape
.
<- read.tree("https://raw.githubusercontent.com/pbastide/cauchy_paper/refs/heads/main/data/Pybus_et_al_2012_Data/WNV_GTR_gamma_MCC.newick")
tree tree
Phylogenetic tree with 104 tips and 103 internal nodes.
Tip labels:
AF202541_Hs_1999.66, FJ151394_Cb_1999, DQ164188_Cb_2003, AF404756_Cb_2000, DQ164194_Cb_2001, AF404754_Cp_2000, ...
Rooted; includes branch length(s).
Each tip of the tree represents a variant of the virus sampled in one individual. The tree has time information in years, so that it can be dated, as in the plot below.
plot(tree, show.tip.label = FALSE)
axisPhylo(root.time = 1998.394, backward = FALSE)
We then need to know the geographical localisation of the virus when they were sampled. These are recorded as latitude and longitude information in the following table:
<- read.table("https://raw.githubusercontent.com/pbastide/cauchy_paper/refs/heads/main/data/Pybus_et_al_2012_Data/WNV_lat_long.txt",
dat header = TRUE)
head(dat)
traits lat long
1 WG007_Hs_2005.59 31.82 -106.56
2 WG009_Hs_2005.67 32.28 -106.74
3 WG011_Hs_2006.66 31.78 -106.50
4 WG013_Hs_2007.48 31.84 -106.53
5 WG024_Hs_2003.53 33.81 -117.83
6 WG080_Hs_2004.56 34.22 -118.44
The names in dat$traits
should match the tip labels of the tree in tree$tip.label
. Using function match
, re-order the lines of dat
to make sure they are in the same as the tip labels. Then plot the tree and data using function phydataplot
.
## Match data and tree
match(tree$tip.label, dat$traits)
[...]
## Plot tree and data
plot(tree, show.tip.label = FALSE, x.lim = 25)
phydataplot(dat[, "lat", drop = F], tree, offset = 1, scaling = 0.1)
phydataplot(dat[, "long", drop = F], tree, offset = 15, scaling = 0.05)
## Match data and tree
<- dat[match(tree$tip.label, dat$traits), ]
dat rownames(dat) <- dat$traits
## Plot tree and data
plot(tree, show.tip.label = FALSE, x.lim = 25)
phydataplot(dat[, "lat", drop = F], tree, offset = 1, scaling = 0.1)
phydataplot(dat[, "long", drop = F], tree, offset = 15, scaling = 0.05)
The two traits correspond to latitude and longitude. To plot these points on a map, we can use packages sf
and rnaturalearth
.
library("sf")
Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library("rnaturalearth")
<- ne_countries(scale = "medium",
north_america returnclass = "sf",
continent = "north america")
ggplot(data = north_america) +
geom_sf() +
coord_sf(xlim = c(-140, -60), ylim = c(20, 55), expand = FALSE) +
geom_point(data = dat, aes(x = long, y = lat))
This map represents the position of the sampled viruses, at all times. Our question are:
- where did the virus emerge for the first time ?
- how did it spread on the map ?
Root position
To find the most probable root position, we fit a multivariate BM to the data.
Recall that, using the BM, the matrix of data has the matrix normal distribution: \[ \mathbf{Y} \sim \mathcal{MN}_{n,2}(\mathbf{1}_n\boldsymbol{\mu}^T, \mathbf{V}, \mathbf{\Sigma}) \] with \(\boldsymbol{\mu}\) the vector of the root position, \(\mathbf{\Sigma}\) the variance matrix of the BM, and \(\mathbf{V}\) the matrix given by the tree.
The maximum likelihood formulas for the parameters are then: \[ \hat{\boldsymbol{\mu}}^T = (\mathbf{1}_n^T \mathbf{V}^{-1} \mathbf{1}_n)^{-1} \mathbf{1}_n^T \mathbf{V}^{-1} \mathbf{Y} \] \[ \hat{\mathbf{\Sigma}} = \frac{1}{n-1} (\mathbf{Y} - \mathbf{1}_n\hat{\boldsymbol{\mu}}^T)^T \mathbf{V}^{-1} (\mathbf{Y} - \mathbf{1}_n\hat{\boldsymbol{\mu}}^T) \]
The tree matrix \(\mathbf{V}\) can be obtained from ape
function vcv
:
<- vcv(tree)
V 1:3,1:3] V[
AF202541_Hs_1999.66 FJ151394_Cb_1999 DQ164188_Cb_2003
AF202541_Hs_1999.66 1.266477 0.0000000 0.0000000
FJ151394_Cb_1999 0.000000 1.0295882 0.4775296
DQ164188_Cb_2003 0.000000 0.4775296 4.8759063
What are the interpretations of the coefficients of \(\mathbf{V}\) ? On this tree, can you find the year of the root ? What does this mean for the epidemic ?
\(V_{ij}\) is the time between the root and the most recent ancestor of tips \(i\) and \(j\). On the diagonal, \(V_{ii}\) is the time between the root and the tip \(i\). Because there are dates associated with each tip, we can find the year of the root with a subtraction:
diag(V)[c(1, 8, 12)]
AF202541_Hs_1999.66 AY289214_Hs_2002.62 AF196835_Pc_1999.71
1.266477 4.226477 1.316477
1999.66 - diag(V)[1]
AF202541_Hs_1999.66
1998.394
2002.62 - diag(V)[8]
AY289214_Hs_2002.62
1998.394
1999.71 - diag(V)[12]
AF196835_Pc_1999.71
1998.394
This means that the epidemic first appear in the US in year \(1998.394\).
(Bonus: what does the .394
means ?)
Implement the formulas above to find \(\hat{\boldsymbol{\mu}}\) and \(\hat{\mathbf{\Sigma}}\).
## data
<- as.matrix(dat[, c(2, 3)])
Y <- nrow(Y)
ndata = rep(1, ndata)
ones
<- [...]
Vinv
## mu hat
<- [...]
mu_hat
## Sigma hat
<- [...] sigma_hat
## data
<- as.matrix(dat[, c(2, 3)])
Y <- nrow(Y)
ndata = rep(1, ndata)
ones
<- solve(V)
Vinv
## mu hat
<- solve(t(ones) %*% Vinv %*% ones) %*% t(ones) %*% Vinv %*% Y
mu_hat
## Sigma hat
<- 1/(ndata-1) * t(Y - ones %*% mu_hat) %*% Vinv %*% (Y - ones %*% mu_hat) sigma_hat
Plot the inferred root position on the map. Where did the virus emerge in the US ? Does this make sense ?
## data
ggplot(data = north_america) +
geom_sf() +
coord_sf(xlim = c(-140, -60), ylim = c(20, 55), expand = FALSE) +
geom_point(data = dat, aes(x = long, y = lat)) +
geom_point(data = mu_hat, aes(x = long, y = lat), col = "red")
Ancestral state reconstruction
Writing \(\mathbf{Z}\) the matrix of states at ancestral nodes, remind the formula: \[ \begin{pmatrix} \mathbf{Z}_{\cdot,1}\\ \mathbf{Z}_{\cdot,2}\\ \mathbf{Y}_{\cdot,1}\\ \mathbf{Y}_{\cdot,2} \end{pmatrix} \sim \mathcal{N}\left( \begin{pmatrix} \mu_1\mathbf{1}_m\\ \mu_2\mathbf{1}_m\\ \mu_1\mathbf{1}_n\\ \mu_2\mathbf{1}_n \end{pmatrix} , \begin{pmatrix} \Sigma_{11}\mathbf{V}_{ZZ} & \Sigma_{12}\mathbf{V}_{ZZ} & \Sigma_{11}\mathbf{V}_{ZY} & \Sigma_{12}\mathbf{V}_{ZY}\\ \Sigma_{21}\mathbf{V}_{ZZ} & \Sigma_{22}\mathbf{V}_{ZZ} & \Sigma_{21}\mathbf{V}_{ZY} & \Sigma_{22}\mathbf{V}_{ZY}\\ \Sigma_{11}\mathbf{V}_{YZ} & \Sigma_{12}\mathbf{V}_{YZ} & \Sigma_{11}\mathbf{V}_{YY} & \Sigma_{12}\mathbf{V}_{YY}\\ \Sigma_{21}\mathbf{V}_{YZ} & \Sigma_{22}\mathbf{V}_{YZ} & \Sigma_{21}\mathbf{V}_{YY} & \Sigma_{22}\mathbf{V}_{YY} \end{pmatrix} \right) \]
In the formula above, we need the full \(\mathbf{V}\) matrix, for both the \(\mathbf{Z}\) and the \(\mathbf{Y}\).
We first label all the nodes in the tree:
<- makeNodeLabel(tree)
tree tree
Phylogenetic tree with 104 tips and 103 internal nodes.
Tip labels:
AF202541_Hs_1999.66, FJ151394_Cb_1999, DQ164188_Cb_2003, AF404756_Cb_2000, DQ164194_Cb_2001, AF404754_Cp_2000, ...
Node labels:
Node1, Node2, Node3, Node4, Node5, Node6, ...
Rooted; includes branch length(s).
Then we compute the full matrix. This time, we construct it naively by hand.
## Get the number of tips and internal nodes
<- length(tree$tip.label)
ntips <- length(tree$node.label)
nnodes ## Contruct the void matrix
<- matrix(NA, ncol = nnodes + ntips, nrow = nnodes + ntips)
Vfull ## Name it with node first, then tips
<- c(tree$node.label, tree$tip.label)
tipandnodeslabels colnames(Vfull) <- rownames(Vfull) <- tipandnodeslabels
## get all ancestors for each pair of node / tip
<- mrca(tree, full = TRUE)
all_ancestors # note: this has tips first, then nodes, so that we re-order it first
<- matrix(NA, ncol = nnodes + ntips, nrow = nnodes + ntips)
all_ancestors_nodes_tips 1:nnodes, 1:nnodes] <- all_ancestors[ntips+1:nnodes, ntips+1:nnodes]
all_ancestors_nodes_tips[1:nnodes, nnodes+1:ntips] <- all_ancestors[ntips+1:nnodes, 1:ntips]
all_ancestors_nodes_tips[+1:ntips, 1:nnodes] <- all_ancestors[1:ntips, ntips+1:nnodes]
all_ancestors_nodes_tips[nnodes+1:ntips, nnodes+1:ntips] <- all_ancestors[1:ntips, 1:ntips]
all_ancestors_nodes_tips[nnodes## Fill it
for (i in 1:(nnodes + ntips)) {
for (j in 1:(nnodes + ntips)) {
<- all_ancestors_nodes_tips[i,j] # ancestor of i and j
ancestor_ij <- node.depth.edgelength(tree)[ancestor_ij] # time between root and ancestor
t_ij <- t_ij # fill matrix
Vfull[i,j]
}
}## separate between nodes and tips
<- Vfull[1:nnodes, 1:nnodes]
VZZ <- Vfull[1:nnodes, nnodes+1:ntips]
VZY <- Vfull[nnodes+1:ntips, nnodes+1:ntips] VYY
We can check that the sub-matrix with the tips is the same as the matrix we previously had:
all(VYY == V)
[1] TRUE
Using the values of \(\boldsymbol{\mu}\) and \(\mathbf{\Sigma}\) estimated in the previous section, compute the expectation and mean of the full vector \((\mathbf{Z}, \mathbf{Y})\). You can use the Kronecker product as presented below.
## Kronecker product
<- matrix(1:4, ncol = 2)
A <- matrix(10, ncol = 3, nrow = 3)
B A
[,1] [,2]
[1,] 1 3
[2,] 2 4
B
[,1] [,2] [,3]
[1,] 10 10 10
[2,] 10 10 10
[3,] 10 10 10
kronecker(A, B)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 10 10 10 30 30 30
[2,] 10 10 10 30 30 30
[3,] 10 10 10 30 30 30
[4,] 20 20 20 40 40 40
[5,] 20 20 20 40 40 40
[6,] 20 20 20 40 40 40
<- kronecker(t(mu_hat), rep(1, nnodes))
mZ <- kronecker(t(mu_hat), rep(1, ntips))
mY <- kronecker(sigma_hat, VZZ)
SZZ <- kronecker(sigma_hat, VZY)
SZY <- kronecker(sigma_hat, VYY) SYY
For the conditional distribution of the ancestral states knowing the tip positions, remind that we have: \[ \mathbf{Z} \mid \{\mathbf{Y} = \mathbf{y}\} \sim \mathcal{N}\left( \bar{\mathbf{m}} , \bar{\mathbf{S}} \right) \]
\[ \bar{\mathbf{m}} = \mathbf{m}_Z + \mathbf{S}_{ZY} \mathbf{S}_{YY}^{-1} (\mathbf{y} - \mathbf{m}_Y) \quad \bar{\mathbf{S}} = \mathbf{S}_{ZZ} - \mathbf{S}_{ZY} \mathbf{S}_{YY}^{-1}\mathbf{S}_{YZ} \]
For each node \(i\), compute the conditional mean of the ancestral position at this node.
## full conditionals
<- [...]
bar_m
## re-organize to find the marginals
<- [...] cond_mean_Z
## full conditionals
<- mZ + SZY %*% solve(SYY) %*% (as.vector(Y) - mY)
bar_m
## re-organize to find the marginals
<- matrix(bar_m, ncol = 2, byrow = FALSE) cond_mean_Z
Plot in Evolaps
We use evolaps to plot the reconstructe states dynamically.
We first need to export the data using the correct format. This requires package treeio
:
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
::install("treeio") BiocManager
Then record the annotated tree in your local repository:
<- list(node = seq_len(ntips + nnodes))
rec_table
## Positions
"location1"]] <- c(dat[, "lat"], cond_mean_Z[, 1])
rec_table[["location2"]] <- c(dat[, "long"], cond_mean_Z[, 2])
rec_table[[## Reconstructions
"location1_median"]] <- c(rep(NA, ntips), cond_mean_Z[, 1])
rec_table[["location2_median"]] <- c(rep(NA, ntips), cond_mean_Z[, 2])
rec_table[[
## treedata
<- treeio::as_tibble(rec_table)
rec_table <- treeio::as_tibble(tree)
tree_tibble <- treeio::full_join(tree_tibble, rec_table, by = 'node')
tree_data <- treeio::as.treedata(tree_data)
tree_data
::write.beast(tree_data, file = "tree_MCC.tree", tree.name = "TREE1") treeio
Finally, read the tree in Evolaps.
Ancestral state reconstruction - variances
For each node \(i\), compute the conditional variance of the ancestral position at this node.
## full conditionals
<- [...]
bar_S
## re-organize to find the marginals
<- list()
cond_var_Z for (i in 1:nnodes) {
<- [...]
cond_var_Z[[i]] }
## full conditionals
<- SZZ - SZY %*% solve(SYY) %*% t(SZY)
bar_S
## re-organize to find the marginals
<- list()
cond_var_Z for (i in 1:nnodes) {
<- matrix(c(bar_S[i,i], bar_S[i, nnodes+i],
cond_var_Z[[i]] +i, i], bar_S[nnodes+i, nnodes+i]),
bar_S[nnodesncol = 2)
}
For each node \(i\), compute the confidence ellipse using package ellipse
.
<- 0.80
level
<- list()
conf_loc for (i in 1:nnodes) {
<- ellipse::ellipse([...])
conf_loc[[i]] }
<- 0.80
level
<- list()
conf_loc for (i in 1:nnodes) {
<- ellipse::ellipse(cond_var_Z[[i]],
conf_loc[[i]] center = cond_mean_Z[i, ],
level = level)
}
Then record the annotated tree in your local repository:
<- list(node = seq_len(ntips + nnodes))
rec_table
## Positions
"location1"]] <- c(dat[, "lat"], cond_mean_Z[, 1])
rec_table[["location2"]] <- c(dat[, "long"], cond_mean_Z[, 2])
rec_table[[## Reconstructions
"location1_median"]] <- c(rep(NA, ntips), cond_mean_Z[, 1])
rec_table[["location2_median"]] <- c(rep(NA, ntips), cond_mean_Z[, 2])
rec_table[[## ellipses
<- paste0("location1_", level * 100, "%HPD")
trait_name_lat <- paste0("location2_", level * 100, "%HPD")
trait_name_long paste0(trait_name_lat, "_", 1)]] <- c(rep(NA, ntips), lapply(conf_loc, function(cc) cc[, 1]))
rec_table[[paste0(trait_name_long, "_", 1)]] <- c(rep(NA, ntips), lapply(conf_loc, function(cc) cc[, 2]))
rec_table[[
## treedata
<- treeio::as_tibble(rec_table)
rec_table <- treeio::as_tibble(tree)
tree_tibble <- treeio::full_join(tree_tibble, rec_table, by = 'node')
tree_data <- treeio::as.treedata(tree_data)
tree_data
::write.beast(tree_data, file = "tree_MCC.tree", tree.name = "TREE1") treeio
Finally, read the tree in Evolaps. Are these confidence regions satisfactory ?