Hello dear colleagues, I use often ape::dist.topo (see here dist.topo.r), which is doing the calculations sequentially, which is very slow for large data sets. I'm sorry, I haven't found any relevant Git repository or so, so I hope Emmanuel won't mind if I discuss it here. I discussed various options with ChatGPT and dist.topo.par1.r is the simplest solution, basically using mc.lapply instead of 2 for loops. Good study material for how to do it in general. Little enhancements are in dist.topo.par2.r, which should be slightly better in case some pair of comparisons would return NA or so, but from my tests there doesn't seem to be any difference. And finally there is dist.topo.par3.r which doesn't load parallel (and uses plain lapply) for cores==1, while parallel and doParallel for multiple cores. It also contains some checks and error handling. From my testing it works well. I'm not sure if tryCatch is really needed there. In any case, improvements welcomed. :-) So, what do You think? Is this usable improvement of ape::dist.topo? Sincerely, V.
-- Vojtěch Zeisek https://trapa.cz/en/ Department of Botany, Faculty of Science Charles University, Prague, Czech Republic https://www.natur.cuni.cz/biology/botany/ https://lab-allience.natur.cuni.cz/ Institute of Botany, Czech Academy of Sciences Průhonice, Czech Republic https://www.ibot.cas.cz/en/ Computing cluster https://sorbus.ibot.cas.cz/en/start
function (x, y = NULL, method = "PH85") { method <- match.arg(method, c("PH85", "score")) if (!is.null(y)) x <- c(x, y) testroot <- any(is.rooted(x)) n <- length(x) res <- numeric(n * (n - 1)/2) nms <- names(x) if (is.null(nms)) nms <- paste0("tree", 1:n) if (method == "PH85") { if (testroot) warning("Some trees were rooted: topological distances may be spurious.") x <- .compressTipLabel(x) ntip <- length(attr(x, "TipLabel")) nnode <- sapply(x, Nnode) foo <- function(phy, ntip) { phy <- reorder(phy, "postorder") pp <- bipartition2(phy$edge, ntip) attr(pp, "labels") <- phy$tip.label ans <- SHORTwise(pp) sapply(ans, paste, collapse = "\r") } x <- lapply(x, foo, ntip = ntip) k <- 0L for (i in 1:(n - 1)) { y <- x[[i]] m1 <- nnode[i] for (j in (i + 1):n) { z <- x[[j]] k <- k + 1L res[k] <- m1 + nnode[j] - 2 * sum(z %in% y) } } } else { k <- 0L for (i in 1:(n - 1)) { for (j in (i + 1):n) { k <- k + 1L res[k] <- .dist.topo.score(x[[i]], x[[j]]) } } } attr(res, "Size") <- n attr(res, "Labels") <- nms attr(res, "Diag") <- attr(res, "Upper") <- FALSE attr(res, "method") <- method class(res) <- "dist" res }
function (x, y = NULL, method = "PH85", cores = detectCores()) { method <- match.arg(method, c("PH85", "score")) if (!is.null(y)) x <- c(x, y) testroot <- any(is.rooted(x)) n <- length(x) res <- numeric(n * (n - 1)/2) nms <- names(x) if (is.null(nms)) nms <- paste0("tree", 1:n) if (cores == 1) { if (method == "PH85") { if (testroot) warning("Some trees were rooted: topological distances may be spurious.") x <- .compressTipLabel(x) ntip <- length(attr(x, "TipLabel")) nnode <- sapply(x, Nnode) foo <- function(phy, ntip) { phy <- reorder(phy, "postorder") pp <- bipartition2(phy$edge, ntip) attr(pp, "labels") <- phy$tip.label ans <- SHORTwise(pp) sapply(ans, paste, collapse = "\r") } x <- lapply(x, foo, ntip = ntip) res_list <- lapply(1:(n - 1), function(i) { y <- x[[i]] m1 <- nnode[i] res_sub <- numeric(n - i) for (j in (i + 1):n) { z <- x[[j]] res_sub[j - i] <- m1 + nnode[j] - 2 * sum(z %in% y) } res_sub }) } else { res_list <- lapply(1:(n - 1), function(i) { res_sub <- numeric(n - i) for (j in (i + 1):n) { res_sub[j - i] <- .dist.topo.score(x[[i]], x[[j]]) } res_sub }) } } else { tryCatch({ cl <- makeCluster(cores) registerDoParallel(cl) library(parallel) library(doParallel) cl <- makeCluster(cores) registerDoParallel(cl) }, error = function(e) { message("Failed to initiate cluster: ", conditionMessage(e)) stop(e) }) if (method == "PH85") { if (testroot) warning("Some trees were rooted: topological distances may be spurious.") x <- .compressTipLabel(x) ntip <- length(attr(x, "TipLabel")) nnode <- sapply(x, Nnode) foo <- function(phy, ntip) { phy <- reorder(phy, "postorder") pp <- bipartition2(phy$edge, ntip) attr(pp, "labels") <- phy$tip.label ans <- SHORTwise(pp) sapply(ans, paste, collapse = "\r") } x <- lapply(x, foo, ntip = ntip) res_list <- mclapply(1:(n - 1), function(i) { y <- x[[i]] m1 <- nnode[i] res_sub <- numeric(n - i) for (j in (i + 1):n) { z <- x[[j]] res_sub[j - i] <- tryCatch(m1 + nnode[j] - 2 * sum(z %in% y), error = function(e) NA) } res_sub }, mc.cores = cores) } else { res_list <- mclapply(1:(n - 1), function(i) { res_sub <- numeric(n - i) for (j in (i + 1):n) { res_sub[j - i] <- tryCatch(.dist.topo.score(x[[i]], x[[j]]), error = function(e) NA) } res_sub }, mc.cores = cores) } stopCluster(cl) registerDoSEQ() } res <- unlist(res_list) if (length(res) == 0) { res <- numeric(n * (n - 1)/2) } attr(res, "Size") <- n attr(res, "Labels") <- nms attr(res, "Diag") <- attr(res, "Upper") <- FALSE attr(res, "method") <- method class(res) <- "dist" res <- as.dist(res) res }
function (x, y = NULL, method = "PH85", cores = detectCores()) { library(parallel) method <- match.arg(method, c("PH85", "score")) if (!is.null(y)) x <- c(x, y) testroot <- any(is.rooted(x)) n <- length(x) res <- numeric(n * (n - 1)/2) nms <- names(x) if (is.null(nms)) nms <- paste0("tree", 1:n) if (method == "PH85") { if (testroot) warning("Some trees were rooted: topological distances may be spurious.") x <- .compressTipLabel(x) ntip <- length(attr(x, "TipLabel")) nnode <- sapply(x, Nnode) foo <- function(phy, ntip) { phy <- reorder(phy, "postorder") pp <- bipartition2(phy$edge, ntip) attr(pp, "labels") <- phy$tip.label ans <- SHORTwise(pp) sapply(ans, paste, collapse = "\r") } x <- lapply(x, foo, ntip = ntip) idx <- combn(n, 2) res <- mclapply(seq_len(ncol(idx)), function(i) { y <- x[[idx[1,i]]] z <- x[[idx[2,i]]] m1 <- nnode[idx[1,i]] m2 <- nnode[idx[2,i]] m1 + m2 - 2 * sum(z %in% y) }, mc.cores = cores) res <- matrix(unlist(res), nrow = n, ncol = n, byrow = TRUE) } else { idx <- combn(n, 2) res <- mclapply(seq_len(ncol(idx)), function(i) { .dist.topo.score(x[[idx[1,i]]], x[[idx[2,i]]]) }, mc.cores = cores) res <- matrix(unlist(res), nrow = n, ncol = n, byrow = TRUE) } attr(res, "Size") <- n attr(res, "Labels") <- nms attr(res, "Diag") <- attr(res, "Upper") <- FALSE attr(res, "method") <- method class(res) <- "dist" res }
function (x, y = NULL, method = "PH85", cores = detectCores()) { library(parallel) method <- match.arg(method, c("PH85", "score")) if (!is.null(y)) x <- c(x, y) testroot <- any(is.rooted(x)) n <- length(x) res <- numeric(n * (n - 1)/2) nms <- names(x) if (is.null(nms)) nms <- paste0("tree", 1:n) if (method == "PH85") { if (testroot) warning("Some trees were rooted: topological distances may be spurious.") x <- .compressTipLabel(x) ntip <- length(attr(x, "TipLabel")) nnode <- sapply(x, Nnode) foo <- function(phy, ntip) { phy <- reorder(phy, "postorder") pp <- bipartition2(phy$edge, ntip) attr(pp, "labels") <- phy$tip.label ans <- SHORTwise(pp) sapply(ans, paste, collapse = "\r") } x <- lapply(x, foo, ntip = ntip) res_list <- mclapply(1:(n - 1), function(i) { y <- x[[i]] m1 <- nnode[i] res_sub <- numeric(n - i) for (j in (i + 1):n) { z <- x[[j]] res_sub[j - i] <- m1 + nnode[j] - 2 * sum(z %in% y) } res_sub }, mc.cores = cores) } else { res_list <- mclapply(1:(n - 1), function(i) { res_sub <- numeric(n - i) for (j in (i + 1):n) { res_sub[j - i] <- .dist.topo.score(x[[i]], x[[j]]) } res_sub }, mc.cores = cores) } res <- unlist(res_list) if (length(res) == 0) { res <- numeric(n * (n - 1)/2) } attr(res, "Size") <- n attr(res, "Labels") <- nms attr(res, "Diag") <- attr(res, "Upper") <- FALSE attr(res, "method") <- method class(res) <- "dist" res <- as.dist(res) res }
signature.asc
Description: This is a digitally signed message part.
_______________________________________________ R-sig-phylo mailing list - R-sig-phylo@r-project.org https://stat.ethz.ch/mailman/listinfo/r-sig-phylo Searchable archive at http://www.mail-archive.com/r-sig-phylo@r-project.org/