pbd_ML demo

This document gives a demonstration how to use the package to obtain a maximum-likelihood estimate of the protracted birth-death speciation model.

First thing is to load the PBD package itself:

rm(list = ls())
library(PBD)

We will also need ape for branching.times:

library(ape)

Here we simulate a tree with known parameters:

seed <- 43
set.seed(seed)
b_1 <- 0.3 # speciation-initiation rate of good species
la_1 <- 0.2 # speciation-completion rate
b_2 <- b_1 # the speciation-initiation rate of incipient species
mu_1 <- 0.1 #  extinction rate of good species
mu_2 <- mu_1 # extinction rate of incipient species 
pars <- c(b_1, la_1, b_2, mu_1, mu_2)
age <- 15 # the age for the simulation 
phylogenies <- pbd_sim(pars = pars, age = age)
plot(phylogenies$recontree)

plot(phylogenies$igtree.extant)
## no colors provided. using the following legend:
##         g         i 
##   "black" "#DF536B"

plot(phylogenies$tree)

names(phylogenies)
##  [1] "tree"           "stree_random"   "stree_oldest"   "stree_youngest"
##  [5] "L"              "sL_random"      "sL_oldest"      "sL_youngest"   
##  [9] "igtree.extinct" "igtree.extant"  "recontree"      "reconL"        
## [13] "L0"

Now we try to recover the parameters by maximum likelihood estimation:

brts <- branching.times(phylogenies$recontree)  # branching times
init_b <- 0.2  # speciation-initiation rate
init_mu_1 <- 0.05  # extinction rate of good species
init_la_1 <- 0.3 # speciation-completion rate
#init_mu_2 <- 0.05  # extinction rate of incipient species  # not used

# The initial values of the parameters that must be optimized
initparsopt <- c(init_b, init_mu_1, init_la_1)

# The extinction rates between incipient and good species are equal
exteq <- TRUE

# The first element of the branching times is the crown age (and not the stem age)
soc <- 2

# Conditioning on non-extinction of the phylogeny
# as I actively selected for a nice phylogeny
cond <- 1

# Give the likelihood of the phylogeny (instead of the likelihood of the branching times)
btorph <- 1

Maximum likelihood estimation can now be performed:

r <- pbd_ML(
  brts = brts,
  initparsopt = initparsopt, 
  exteq = exteq,
  soc = soc, 
  cond = cond,
  btorph = btorph,
  verbose = FALSE
)

The ML parameter estimates are:

knitr::kable(r)
b mu_1 lambda_1 mu_2 loglik df conv
0.2540257 0.1016375 0.2159545 0.1016375 -35.89729 3 0

Comparing the known true value with the recovered values:

loglik_true <- PBD::pbd_loglik(pars, brts = brts)
## Parameters: 0.3, 0.2, 0.3, 0.1, 0.1, Loglikelihood: -36.217571
df <- as.data.frame(r)
df <- rbind(df, c(b_1, mu_1, la_1, mu_2, loglik_true, NA, NA))
row.names(df) <- c("ML", "true")
knitr::kable(df)
b mu_1 lambda_1 mu_2 loglik df conv
ML 0.2540257 0.1016375 0.2159545 0.1016375 -35.89729 3 0
true 0.3000000 0.1000000 0.2000000 0.1000000 -36.21757 NA NA

Ideally, all parameter columns should have the same values.

To test for the uncertainty of our ML estimate, we can do a parametric bootstrap.

The function pbd_bootstrap consists of a few steps:

  1. Do a ML estimate
  2. Run a simulation with those estimates
  3. Perform ML estimation on the simulated data
  4. Go to 2 depending on the setting of endmc.
endmc <- 10 # Sets the number of simulations for the bootstrap

b <- pbd_bootstrap(
  brts = brts,
  initparsopt = initparsopt, 
  exteq = exteq,
  soc = soc, 
  cond = cond,
  btorph = btorph,
  plotltt = FALSE,
  endmc = endmc,
  seed = seed
)
## Finding the maximum likelihood estimates ...
## 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -35.91175 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 0.253010, mu_1: 0.100538, lambda_1: 0.216759, mu_2: 0.100538 
##  Maximum loglikelihood: -35.897291 
##  The expected duration of speciation for these parameters is: 2.769582 
##  The median duration of speciation for these parameters is: 2.147914 
## Bootstrapping ...
## 
## Simulated data set 1 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -16.96034 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 0.363903, mu_1: 0.351225, lambda_1: 0.242411, mu_2: 0.351225 
##  Maximum loglikelihood: -16.372242 
##  The expected duration of speciation for these parameters is: 1.891841 
##  The median duration of speciation for these parameters is: 1.419619 
## Simulated data set 2 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -28.03035 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 0.454120, mu_1: 0.410801, lambda_1: 0.340717, mu_2: 0.410801 
##  Maximum loglikelihood: -27.160309 
##  The expected duration of speciation for these parameters is: 1.436141 
##  The median duration of speciation for these parameters is: 1.078739 
## Simulated data set 3 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -27.31665 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 0.621223, mu_1: 0.553980, lambda_1: 0.203032, mu_2: 0.553980 
##  Maximum loglikelihood: -26.904733 
##  The expected duration of speciation for these parameters is: 1.774255 
##  The median duration of speciation for these parameters is: 1.365207 
## Simulated data set 4 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -51.47415 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 82.945566, mu_1: 82.753756, lambda_1: 0.010808, mu_2: 82.753756 
##  Maximum loglikelihood: -50.553779 
##  The expected duration of speciation for these parameters is: 0.761437 
##  The median duration of speciation for these parameters is: 0.613138 
## Simulated data set 5 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -19.02915 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 2.202783, mu_1: 2.044696, lambda_1: 0.014633, mu_2: 2.044696 
##  Maximum loglikelihood: -17.632112 
##  The expected duration of speciation for these parameters is: 4.364224 
##  The median duration of speciation for these parameters is: 3.698899 
## Simulated data set 6 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -25.20562 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 5.332255, mu_1: 5.146879, lambda_1: 0.011020, mu_2: 5.146879 
##  Maximum loglikelihood: -23.819235 
##  The expected duration of speciation for these parameters is: 3.208571 
##  The median duration of speciation for these parameters is: 2.701873 
## Simulated data set 7 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -29.8726 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 1.204262, mu_1: 1.009620, lambda_1: 0.026035, mu_2: 1.009620 
##  Maximum loglikelihood: -29.042784 
##  The expected duration of speciation for these parameters is: 4.475683 
##  The median duration of speciation for these parameters is: 3.839286 
## Simulated data set 8 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -38.46278 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 0.450733, mu_1: 0.300506, lambda_1: 0.167621, mu_2: 0.300506 
##  Maximum loglikelihood: -38.335028 
##  The expected duration of speciation for these parameters is: 2.485889 
##  The median duration of speciation for these parameters is: 1.959915 
## Simulated data set 9 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -45.99938 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 0.303549, mu_1: 0.000003, lambda_1: 0.037387, mu_2: 0.000003 
##  Maximum loglikelihood: -43.849646 
##  The expected duration of speciation for these parameters is: 7.281806 
##  The median duration of speciation for these parameters is: 6.788475 
## Simulated data set 10 out of 10 
## You are optimizing b mu_1 lambda_1 
## You are fixing nothing 
## Extinction rate of incipient species is exactly the same as for good species.
## The likelihood for the initial parameter values is -97.69378 
## Optimizing the likelihood - this may take a while. 
## 
##  Maximum likelihood parameter estimates: b: 0.201410, mu_1: 0.000000, lambda_1: 0.367815, mu_2: 0.000000 
##  Maximum loglikelihood: -96.800484 
##  The expected duration of speciation for these parameters is: 2.168193 
##  The median duration of speciation for these parameters is: 1.642841
knitr::kable(b[[3]])
ntips b mu_1 lambda_1 mu_2 loglik df conv exp_durspec median_durspec
7 0.3639026 0.3512248 0.2424110 0.3512248 -16.37224 3 0 1.8918410 1.4196191
11 0.4541205 0.4108006 0.3407171 0.4108006 -27.16031 3 0 1.4361412 1.0787386
11 0.6212229 0.5539796 0.2030318 0.5539796 -26.90473 3 0 1.7742546 1.3652072
18 82.9455657 82.7537556 0.0108077 82.7537556 -50.55378 3 0 0.7614369 0.6131381
7 2.2027834 2.0446964 0.0146327 2.0446964 -17.63211 3 0 4.3642239 3.6988992
9 5.3322548 5.1468794 0.0110199 5.1468794 -23.81923 3 0 3.2085710 2.7018728
11 1.2042623 1.0096202 0.0260347 1.0096202 -29.04278 3 0 4.4756832 3.8392859
15 0.4507331 0.3005057 0.1676207 0.3005057 -38.33503 3 0 2.4858885 1.9599147
17 0.3035492 0.0000034 0.0373866 0.0000034 -43.84965 3 0 7.2818064 6.7884754
36 0.2014097 0.0000005 0.3678147 0.0000005 -96.80048 3 0 2.1681935 1.6428409

From the bootstrap analysis, we get

Putting this in a table:

dg <- rbind(df, 
  list(
    b = b[[1]]$b, 
    mu_1 = b[[1]]$mu_1, 
    lambda_1 = b[[1]]$lambda_1, 
    mu_2 = b[[1]]$mu_2,
    loglik = b[[1]]$loglik,
    df = b[[1]]$df,
    conv = b[[1]]$conv
  ),
  list(
    b = b[[3]]$b, 
    mu_1 = b[[3]]$mu_1, 
    lambda_1 = b[[3]]$lambda_1, 
    mu_2 = b[[3]]$mu_2,
    loglik = b[[3]]$loglik,
    df = b[[3]]$df,
    conv = b[[3]]$conv
  )
)
dg
##               b         mu_1   lambda_1         mu_2    loglik df conv
## ML    0.2540257 1.016375e-01 0.21595455 1.016375e-01 -35.89729  3    0
## true  0.3000000 1.000000e-01 0.20000000 1.000000e-01 -36.21757 NA   NA
## 1     0.2530101 1.005376e-01 0.21675921 1.005376e-01 -35.89729  3    0
## 11    0.3639026 3.512248e-01 0.24241103 3.512248e-01 -16.37224  3    0
## 2     0.4541205 4.108006e-01 0.34071706 4.108006e-01 -27.16031  3    0
## 3     0.6212229 5.539796e-01 0.20303180 5.539796e-01 -26.90473  3    0
## 4    82.9455657 8.275376e+01 0.01080774 8.275376e+01 -50.55378  3    0
## 5     2.2027834 2.044696e+00 0.01463266 2.044696e+00 -17.63211  3    0
## 6     5.3322548 5.146879e+00 0.01101991 5.146879e+00 -23.81923  3    0
## 7     1.2042623 1.009620e+00 0.02603469 1.009620e+00 -29.04278  3    0
## 8     0.4507331 3.005057e-01 0.16762072 3.005057e-01 -38.33503  3    0
## 9     0.3035492 3.425026e-06 0.03738658 3.425026e-06 -43.84965  3    0
## 10    0.2014097 4.663522e-07 0.36781474 4.663522e-07 -96.80048  3    0
row.names(dg) <- c("ML", "true", "ML2", paste("BS", 1:endmc, sep = ""))
knitr::kable(dg)
b mu_1 lambda_1 mu_2 loglik df conv
ML 0.2540257 0.1016375 0.2159545 0.1016375 -35.89729 3 0
true 0.3000000 0.1000000 0.2000000 0.1000000 -36.21757 NA NA
ML2 0.2530101 0.1005376 0.2167592 0.1005376 -35.89729 3 0
BS1 0.3639026 0.3512248 0.2424110 0.3512248 -16.37224 3 0
BS2 0.4541205 0.4108006 0.3407171 0.4108006 -27.16031 3 0
BS3 0.6212229 0.5539796 0.2030318 0.5539796 -26.90473 3 0
BS4 82.9455657 82.7537556 0.0108077 82.7537556 -50.55378 3 0
BS5 2.2027834 2.0446964 0.0146327 2.0446964 -17.63211 3 0
BS6 5.3322548 5.1468794 0.0110199 5.1468794 -23.81923 3 0
BS7 1.2042623 1.0096202 0.0260347 1.0096202 -29.04278 3 0
BS8 0.4507331 0.3005057 0.1676207 0.3005057 -38.33503 3 0
BS9 0.3035492 0.0000034 0.0373866 0.0000034 -43.84965 3 0
BS10 0.2014097 0.0000005 0.3678147 0.0000005 -96.80048 3 0

We expect rows ML and ML2 to be identical. Their values are indeed very similar.

We can calculate the loglikelihood for

ml_b <- b[[1]]$b
ml_mu_1 <- b[[1]]$mu_1
ml_la_1 <- b[[1]]$lambda_1
ml_mu_2 <- b[[1]]$mu_2
ml_pars1 <- c(ml_b, ml_mu_1, ml_la_1, ml_mu_2)
ml_pars2 <- c(cond, btorph, soc, 0, "lsoda")

l <- pbd_loglik(
  pars1 = ml_pars1,
  pars2 = ml_pars2,
  brts = brts
)
print(l)
## [1] -35.89729
# Create .md, .html, and .pdf files
setwd(paste(getwd(), "vignettes", sep = "/"))
knit("PBD_ML_demo.Rmd")
markdown::markdownToHTML('PBD_ML_demo.md', 'PBD_ML_demo.html', options=c("use_xhml"))
system("pandoc -s PBD_ML_demo.html -o PBD_ML_demo.pdf")