Hello dear colleagues,
I use often ape::dist.topo (see here dist.topo.r), which is doing the 
calculations sequentially, which is very slow for large data sets. I'm sorry, 
I haven't found any relevant Git repository or so, so I hope Emmanuel won't 
mind if I discuss it here.
I discussed various options with ChatGPT and dist.topo.par1.r is the simplest 
solution, basically using mc.lapply instead of 2 for loops. Good study 
material for how to do it in general. Little enhancements are in 
dist.topo.par2.r, which should be slightly better in case some pair of 
comparisons would return NA or so, but from my tests there doesn't seem to be 
any difference.
And finally there is dist.topo.par3.r which doesn't load parallel (and uses 
plain lapply) for cores==1, while parallel and doParallel for multiple cores. 
It also contains some checks and error handling. From my testing it works 
well. I'm not sure if tryCatch is really needed there. In any case, 
improvements welcomed. :-)
So, what do You think? Is this usable improvement of ape::dist.topo?
Sincerely,
V.

-- 
Vojtěch Zeisek
https://trapa.cz/en/

Department of Botany, Faculty of Science
Charles University, Prague, Czech Republic
https://www.natur.cuni.cz/biology/botany/
https://lab-allience.natur.cuni.cz/

Institute of Botany, Czech Academy of Sciences
Průhonice, Czech Republic
https://www.ibot.cas.cz/en/
Computing cluster
https://sorbus.ibot.cas.cz/en/start
function (x, y = NULL, method = "PH85") 
{
	method <- match.arg(method, c("PH85", "score"))
	if (!is.null(y)) x <- c(x, y)
	testroot <- any(is.rooted(x))
	n <- length(x)
	res <- numeric(n * (n - 1)/2)
	nms <- names(x)
	if (is.null(nms)) nms <- paste0("tree", 1:n)
	if (method == "PH85") {
			if (testroot) warning("Some trees were rooted: topological distances may be spurious.")
			x <- .compressTipLabel(x)
			ntip <- length(attr(x, "TipLabel"))
			nnode <- sapply(x, Nnode)
			foo <- function(phy, ntip) {
					phy <- reorder(phy, "postorder")
					pp <- bipartition2(phy$edge, ntip)
					attr(pp, "labels") <- phy$tip.label
					ans <- SHORTwise(pp)
					sapply(ans, paste, collapse = "\r")
			}
			x <- lapply(x, foo, ntip = ntip)
			k <- 0L
			for (i in 1:(n - 1)) {
					y <- x[[i]]
					m1 <- nnode[i]
					for (j in (i + 1):n) {
							z <- x[[j]]
							k <- k + 1L
							res[k] <- m1 + nnode[j] - 2 * sum(z %in% y)
					}
			}
	}
	else {
			k <- 0L
			for (i in 1:(n - 1)) {
					for (j in (i + 1):n) {
							k <- k + 1L
							res[k] <- .dist.topo.score(x[[i]], x[[j]])
					}
			}
	}
	attr(res, "Size") <- n
	attr(res, "Labels") <- nms
	attr(res, "Diag") <- attr(res, "Upper") <- FALSE
	attr(res, "method") <- method
	class(res) <- "dist"
	res
}
function (x, y = NULL, method = "PH85", cores = detectCores())
{
  method <- match.arg(method, c("PH85", "score"))
  if (!is.null(y)) x <- c(x, y)
  testroot <- any(is.rooted(x))
  n <- length(x)
  res <- numeric(n * (n - 1)/2)
  nms <- names(x)
  if (is.null(nms)) nms <- paste0("tree", 1:n)
  if (cores == 1) {
    if (method == "PH85") {
      if (testroot) warning("Some trees were rooted: topological distances may be spurious.")
      x <- .compressTipLabel(x)
      ntip <- length(attr(x, "TipLabel"))
      nnode <- sapply(x, Nnode)
      foo <- function(phy, ntip) {
        phy <- reorder(phy, "postorder")
        pp <- bipartition2(phy$edge, ntip)
        attr(pp, "labels") <- phy$tip.label
        ans <- SHORTwise(pp)
        sapply(ans, paste, collapse = "\r")
      }
      x <- lapply(x, foo, ntip = ntip)
      res_list <- lapply(1:(n - 1), function(i) {
        y <- x[[i]]
        m1 <- nnode[i]
        res_sub <- numeric(n - i)
        for (j in (i + 1):n) {
          z <- x[[j]]
          res_sub[j - i] <- m1 + nnode[j] - 2 * sum(z %in% y)
        }
        res_sub
      })
    } else {
      res_list <- lapply(1:(n - 1), function(i) {
        res_sub <- numeric(n - i)
        for (j in (i + 1):n) {
          res_sub[j - i] <- .dist.topo.score(x[[i]], x[[j]])
        }
        res_sub
      })
    }
  } else {
    tryCatch({
      cl <- makeCluster(cores)
      registerDoParallel(cl)
      library(parallel)
      library(doParallel)
      cl <- makeCluster(cores)
      registerDoParallel(cl)
    }, error = function(e) {
     message("Failed to initiate cluster: ", conditionMessage(e))
     stop(e)
     })
    if (method == "PH85") {
      if (testroot) warning("Some trees were rooted: topological distances may be spurious.")
      x <- .compressTipLabel(x)
      ntip <- length(attr(x, "TipLabel"))
      nnode <- sapply(x, Nnode)
      foo <- function(phy, ntip) {
        phy <- reorder(phy, "postorder")
        pp <- bipartition2(phy$edge, ntip)
        attr(pp, "labels") <- phy$tip.label
        ans <- SHORTwise(pp)
        sapply(ans, paste, collapse = "\r")
      }
      x <- lapply(x, foo, ntip = ntip)
      res_list <- mclapply(1:(n - 1), function(i) {
        y <- x[[i]]
        m1 <- nnode[i]
        res_sub <- numeric(n - i)
        for (j in (i + 1):n) {
          z <- x[[j]]
          res_sub[j - i] <- tryCatch(m1 + nnode[j] - 2 * sum(z %in% y),
                                 error = function(e) NA)
        }
        res_sub
      }, mc.cores = cores)
    } else {
      res_list <- mclapply(1:(n - 1), function(i) {
        res_sub <- numeric(n - i)
        for (j in (i + 1):n) {
          res_sub[j - i] <- tryCatch(.dist.topo.score(x[[i]], x[[j]]),
                                 error = function(e) NA)
        }
        res_sub
      }, mc.cores = cores)
    }
    stopCluster(cl)
    registerDoSEQ()
  }
  res <- unlist(res_list)
  if (length(res) == 0) { res <- numeric(n * (n - 1)/2) }
  attr(res, "Size") <- n
  attr(res, "Labels") <- nms
  attr(res, "Diag") <- attr(res, "Upper") <- FALSE
  attr(res, "method") <- method
  class(res) <- "dist"
  res <- as.dist(res)
  res
}
function (x, y = NULL, method = "PH85", cores = detectCores())
{
	library(parallel)
	method <- match.arg(method, c("PH85", "score"))
	if (!is.null(y)) x <- c(x, y)
	testroot <- any(is.rooted(x))
	n <- length(x)
	res <- numeric(n * (n - 1)/2)
	nms <- names(x)
	if (is.null(nms)) nms <- paste0("tree", 1:n)
	if (method == "PH85") {
		if (testroot) warning("Some trees were rooted: topological distances may be spurious.")
		x <- .compressTipLabel(x)
		ntip <- length(attr(x, "TipLabel"))
		nnode <- sapply(x, Nnode)
		foo <- function(phy, ntip) {
			phy <- reorder(phy, "postorder")
			pp <- bipartition2(phy$edge, ntip)
			attr(pp, "labels") <- phy$tip.label
			ans <- SHORTwise(pp)
			sapply(ans, paste, collapse = "\r")
		}
		x <- lapply(x, foo, ntip = ntip)
		idx <- combn(n, 2)
		res <- mclapply(seq_len(ncol(idx)), function(i) {
			y <- x[[idx[1,i]]]
			z <- x[[idx[2,i]]]
			m1 <- nnode[idx[1,i]]
			m2 <- nnode[idx[2,i]]
			m1 + m2 - 2 * sum(z %in% y)
		}, mc.cores = cores)
		res <- matrix(unlist(res), nrow = n, ncol = n, byrow = TRUE)
	}
	else {
		idx <- combn(n, 2)
		res <- mclapply(seq_len(ncol(idx)), function(i) {
			.dist.topo.score(x[[idx[1,i]]], x[[idx[2,i]]])
		}, mc.cores = cores)
		res <- matrix(unlist(res), nrow = n, ncol = n, byrow = TRUE)
	}
	attr(res, "Size") <- n
	attr(res, "Labels") <- nms
	attr(res, "Diag") <- attr(res, "Upper") <- FALSE
	attr(res, "method") <- method
	class(res) <- "dist"
	res
}
function (x, y = NULL, method = "PH85", cores = detectCores())
{
  library(parallel)
  method <- match.arg(method, c("PH85", "score"))
  if (!is.null(y)) x <- c(x, y)
  testroot <- any(is.rooted(x))
  n <- length(x)
  res <- numeric(n * (n - 1)/2)
  nms <- names(x)
  if (is.null(nms)) nms <- paste0("tree", 1:n)
  if (method == "PH85") {
    if (testroot) warning("Some trees were rooted: topological distances may be spurious.")
    x <- .compressTipLabel(x)
    ntip <- length(attr(x, "TipLabel"))
    nnode <- sapply(x, Nnode)
    foo <- function(phy, ntip) {
      phy <- reorder(phy, "postorder")
      pp <- bipartition2(phy$edge, ntip)
      attr(pp, "labels") <- phy$tip.label
      ans <- SHORTwise(pp)
      sapply(ans, paste, collapse = "\r")
    }
    x <- lapply(x, foo, ntip = ntip)
    res_list <- mclapply(1:(n - 1), function(i) {
      y <- x[[i]]
      m1 <- nnode[i]
      res_sub <- numeric(n - i)
      for (j in (i + 1):n) {
        z <- x[[j]]
        res_sub[j - i] <- m1 + nnode[j] - 2 * sum(z %in% y)
      }
      res_sub
    }, mc.cores = cores)
  } else {
    res_list <- mclapply(1:(n - 1), function(i) {
      res_sub <- numeric(n - i)
      for (j in (i + 1):n) {
        res_sub[j - i] <- .dist.topo.score(x[[i]], x[[j]])
      }
      res_sub
    }, mc.cores = cores)
  }
  res <- unlist(res_list)
  if (length(res) == 0) { res <- numeric(n * (n - 1)/2) }
  attr(res, "Size") <- n
  attr(res, "Labels") <- nms
  attr(res, "Diag") <- attr(res, "Upper") <- FALSE
  attr(res, "method") <- method
  class(res) <- "dist"
  res <- as.dist(res)
  res
}

Attachment: signature.asc
Description: This is a digitally signed message part.

_______________________________________________
R-sig-phylo mailing list - R-sig-phylo@r-project.org
https://stat.ethz.ch/mailman/listinfo/r-sig-phylo
Searchable archive at http://www.mail-archive.com/r-sig-phylo@r-project.org/

Reply via email to