Hi folks-
This is following up on a post from last year. When estimating state
probabilities with ace() for discrete characters, you can get
negative probabilities. They still sum to 1 for each internal node,
but this just didn't make sense to me. See example below. Yes, this
is a contrived and simple example, but I have now observed this
behavior with several real datasets. Since the actual state
probabilities as per the Pagel (1994) algorithm should be summations
of probabilities over all descendent nodes, I don't see how these can
be - in principle - negative.
The problem seems to be due to the fact that the rate estimates are
allowed to be negative, as ace() is currently implemented; maybe
something with NaN or -Inf values in the optimization.
Anyway, I modified ace and obtained results that seem to make sense,
by optimizing log-transformed rate parameters, thus enforcing a
constraint that rates are greater than zero. This seems to work.
Also, optimize is supposedly more stable than nlm or optim (or so I
thought) for 1-dimensional optimization, so maybe it should use this
for a 1-rate markov model. As a quick check, I've pasted a modified
version of ace below in case anyone is interested.
I think this *can* change results relative to the old version.
Example of the problem:
library(laser);
data(skinktree);
data <- c(1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2,
1,1, 2, 2);
anc.states <- ace(data, skinktree, type='discrete', model='ER', ip=.1);
# Bad
anc.states$lik.anc;
anc.states$rates
###
# check with new ace, below:
states_new <- ace.discrete(data, skinktree, model='ER', ip=.1);
states_new;
ace.discrete <- function (x, phy, method = "ML", CI = TRUE, model =
"ER", scaled = TRUE, kappa = 1, ip = 0.1)
{
if (class(phy) != "phylo")
stop("object \"phy\" is not of class \"phylo\".")
if (is.null(phy$edge.length))
stop("tree has no branch lengths")
nb.tip <- length(phy$tip.label)
nb.node <- phy$Nnode
if (nb.node != nb.tip - 1)
stop("\"phy\" is not rooted AND fully dichotomous.")
if (length(x) != nb.tip)
stop("length of phenotypic and of phylogenetic data do not
match.")
if (!is.null(names(x))) {
if (all(names(x) %in% phy$tip.label))
x <- x[phy$tip.label]
else warning("the names of argument \"x\" and the tip labels
of the tree\ndid not match: the former were ignored in the analysis.")
}
obj <- list()
if (kappa != 1)
phy$edge.length <- phy$edge.length^kappa
if (method != "ML")
stop("only ML estimation is possible for discrete
characters.")
if (!is.factor(x))
x <- factor(x)
nl <- nlevels(x)
lvls <- levels(x)
x <- as.integer(x)
if (is.character(model)) {
rate <- matrix(NA, nl, nl)
if (model == "ER")
np <- rate[] <- 1
if (model == "ARD") {
np <- nl * (nl - 1)
rate[col(rate) != row(rate)] <- 1:np
}
if (model == "SYM") {
np <- nl * (nl - 1)/2
rate[col(rate) < row(rate)] <- 1:np
rate <- t(rate)
rate[col(rate) < row(rate)] <- 1:np
}
}
else {
if (ncol(model) != nrow(model))
stop("the matrix given as `model' is not square")
if (ncol(model) != nl)
stop("the matrix `model' must have as many rows\nas
the number of categories in `x'")
rate <- model
np <- max(rate)
}
index.matrix <- rate
index.matrix[cbind(1:nl, 1:nl)] <- NA
rate[cbind(1:nl, 1:nl)] <- 0
rate[rate == 0] <- np + 1
liks <- matrix(0, nb.tip + nb.node, nl)
for (i in 1:nb.tip) liks[i, x[i]] <- 1
phy <- reorder(phy, "pruningwise")
Q <- matrix(0, nl, nl)
dev <- function(p, output.liks = FALSE) {
p <- exp(p);
Q[] <- c(p, 0)[rate]
diag(Q) <- -rowSums(Q)
for (i in seq(from = 1, by = 2, length.out = nb.node)) {
j <- i + 1
anc <- phy$edge[i, 1]
des1 <- phy$edge[i, 2]
des2 <- phy$edge[j, 2]
tmp <- eigen(Q * phy$edge.length[i], symmetric = FALSE)
P1 <- tmp$vectors %*% diag(exp(tmp$values)) %*%
solve(tmp$vectors)
tmp <- eigen(Q * phy$edge.length[j], symmetric = FALSE)
P2 <- tmp$vectors %*% diag(exp(tmp$values)) %*%
solve(tmp$vectors)
liks[anc, ] <- P1 %*% liks[des1, ] * P2 %*% liks[des2,
]
}
if (output.liks)
return(liks[-(1:nb.tip), ])
LH <- -2 * log(sum(liks[nb.tip + 1, ]));
return(LH);
}
out <- nlm(function(p) dev(p), p = rep(log(ip), length.out =
np), hessian = TRUE);
obj$rates <- exp(out$estimate);
obj$loglik <- -out$minimum/2;
if (any(out$gradient == 0))
warning("The likelihood gradient seems flat in at least
one dimension (gradient null)\ncannot compute the standard-errors of
the transition rates.\n")
else obj$se <- sqrt(diag(solve(out$hessian)))
obj$index.matrix <- index.matrix
if (CI) {
lik.anc <- dev(log(obj$rates), TRUE)
lik.anc <- lik.anc/rowSums(lik.anc)
colnames(lik.anc) <- lvls
obj$lik.anc <- lik.anc
}
obj$call <- match.call()
class(obj) <- "ace"
obj
}
Dan Rabosky
Department of Ecology and Evolutionary Biology &
Fuller Evolutionary Biology Program
Cornell Lab of Ornithology
Cornell University
Ithaca, NY 14853-2701
DLR32Xcornell.edu (X = @)
ph 607 592 4636
fax 607 255 8088
new website:
http://www.eeb.cornell.edu/Rabosky/dan/main.html
[[alternative HTML version deleted]]
_______________________________________________
R-sig-phylo mailing list
[email protected]
https://stat.ethz.ch/mailman/listinfo/r-sig-phylo