Hi Joseph & Co, don't trust that shallowest node has lowest node id. That is a byproduct by functions like rtree() or read.tree(), but must not be generally true. E.g. after tree rearrangements. There is nothing wrong with this tree:
library(ape) tree <- structure(list(edge = structure(c(6L, 6L, 8L, 7L, 7L, 8L, 6L, 1L, 8L, 7L, 2L, 3L, 4L, 5L), .Dim = c(7L, 2L)), tip.label = c("t1", "t2", "t3", "t4", "t5"), Nnode = 3L), .Names = c("edge", "tip.label", "Nnode"), class = "phylo", order = "cladewise") plot(tree) nodelabels() getMRCA_new(tree, c(2:4)) [1] 7 So here is my last version: getMRCA <- function(phy, tip) { if (!inherits(phy, "phylo")) stop('object "phy" is not of class "phylo"') if (length(tip) < 2) return(NULL) Ntip <- length(phy$tip.label) rootnd <- Ntip + 1L pars <- integer(phy$Nnode) # worst case assignment, usually far too long tnd <- if (is.character(tip)) match(tip, phy$tip.label) else tip done_v <- logical(max(phy$edge)) ## build a lookup table to get parents faster pvec <- integer(Ntip + phy$Nnode) pvec[phy$edge[, 2]] <- phy$edge[, 1] ## get entire lineage for first tip nd <- tnd[1] for (k in 1:phy$Nnode) { nd <- pvec[nd] pars[k] <- nd if (nd == rootnd) break } pars <- pars[1:k] # delete the rest mrcind <- integer(max(pars)) mrcind[pars] <- 1:k mrcand <- pars[1] ## traverse lineages for remaining tips, stop if hit common ancestor for (i in 2:length(tnd)) { cnd <- tnd[i] done <- done_v[cnd] while(!done){ done_v[cnd] <- TRUE cpar <- pvec[cnd] # get immediate parent done <- done_v[cpar] # early exit if TRUE if (cpar %in% pars) { if (cpar == rootnd) return(rootnd) # early exit if(mrcind[cpar] > mrcind[mrcand]) mrcand <- cpar done_v[cpar] <- TRUE done <- TRUE } cnd <- cpar # keep going! } } mrcand } It exists also early if an internal node was traversed before hand, so the following code is a bit faster now. library(ape) tree <- stree(2^17, "balanced") system.time(getMRCA(tree, 1:2^16)) Cheers, Klaus On Sat, Jun 10, 2017 at 1:01 PM, Emmanuel Paradis <emmanuel.para...@ird.fr> wrote: > Joseph, > > Really cool. I updated the code in ape. > > About (byte-)compiling, this is usually done during installing the package: > > http://ape-package.ird.fr/ape_installation.html#linux > > I guess Win and Mac versions from CRAN are byte-compiled. To check if a > function is compiled, just print it in R, it will show the bytecode at the > end of the display: > > R> rtree > .... > <bytecode: 0x41a1548> > <environment: namespace:ape> > > Besides, the NEWS for R 3.4.0 says: > > "The JIT (‘Just In Time’) byte-code compiler is now enabled by default at > its level 3. This means functions will be compiled on first or second use > and top-level loops will be compiled and then run. (Thanks to Tomas > Kalibera for extensive work to make this possible.)" This seems to be effective: > > R> foo <- function(x) x > R> foo(1) > [1] 1 > R> foo > function(x) x > R> foo(1) > [1] 1 > R> foo > function(x) x > <bytecode: 0x604bda8> > > Best, > > Emmanuel > > > Le 10/06/2017 à 14:29, Joseph W. Brown a écrit : > >> Emmanuel and Klaus, >> >> After Klaus reminded me that `which` was so inefficient I realized using >> it in finding the match in the first lineage is unnecessary. Instead, we >> can capitalize on the fact that shallower nodes have smaller nodeids. So I >> threw out the vector index `mrcaind` altogether: >> >> getMRCA_new<- function(phy, tip) { >> if (!inherits(phy, "phylo")) >> stop('object "phy" is not of class "phylo"') >> if (length(tip) < 2) return(NULL) >> Ntip <- length(phy$tip.label) >> rootnd <- Ntip + 1L >> >> pars<- integer(phy$Nnode) # worst case assignment, usually far too long >> tnd <- if (is.character(tip)) match(tip, phy$tip.label) else tip >> >> ## build a lookup table to get parents faster >> pvec <- integer(Ntip + phy$Nnode) >> pvec[phy$edge[, 2]] <- phy$edge[, 1] >> >> ## get entire lineage for first tip >> nd <- tnd[1] >> for (k in 1:phy$Nnode) { >> nd <- pvec[nd] >> pars[k] <- nd >> if (nd == rootnd) break >> } >> pars<- pars[1:k] # delete the rest >> mrcand <- pars[1]; # keep track of actual node rather than index in pars >> >> >> ## traverse lineages for remaining tips, stop if hit common ancestor >> for (i in 2:length(tnd)) { >> done <- FALSE >> cnd <- tnd[i] >> while (!done) { >> cpar<- pvec[cnd] # get immediate parent >> if (cpar %in% pars) { >> if (cpar == rootnd) return(rootnd) # early exit >> mrcand <- min(mrcand,cpar); # shallowest node has lowest nodeid >> >> done <- TRUE >> } >> cnd <- cpar # keep going! >> } >> } >> mrcand >> } >> >> I also played around with compiling the function: >> >> library(compiler); >> getMRCA_em_cmp<- cmpfun(getMRCA_em); >> getMRCA_new_cmp<- cmpfun(getMRCA_new); >> >> Here are some timings (*_em is your code, *_new is the code above). For >> my original example (100K tips, 500 tips in MRCA query) there is not a >> whole lot of difference: >> >> microbenchmark(getMRCA_em(phy, tips), getMRCA_new(phy, tips), >> getMRCA_em_cmp(phy, tips), getMRCA_new_cmp(phy, tips)); >> Unit: milliseconds >> expr min lq mean median uq max neval >> getMRCA_em(phy, tips) 23.9987929.8196433.6743732.391 >> 4935.61888136.34521100 >> getMRCA_new(phy, tips) 23.7694128.0250031.8428531.370 >> 0134.8964044.66380100 >> getMRCA_em_cmp(phy, tips) 20.8192423.6011828.2395728.252 >> 0230.1853559.72102100 >> getMRCA_new_cmp(phy, tips) 20.1658823.1957127.1961026.657 >> 8329.8214942.61702100 >> >> However, for the worst-case scenario that Klaus devised the results are >> very impressive: >> >> microbenchmark(getMRCA_em(tree, c(1,2)), getMRCA_new(tree, c(1,2)), >> getMRCA_em_cmp(tree, c(1,2)), getMRCA_new_cmp(tree, c(1,2))); >> Unit: milliseconds >> expr min lq mean median uq max neval >> getMRCA_em(tree, c(1, 2)) 124.79040140.44473146.98292145 >> .39224150.96044286.78335100 >> getMRCA_new(tree, c(1, 2)) 123.39924139.41609145.21311143 >> .92062148.34205297.73846100 >> getMRCA_em_cmp(tree, c(1, 2)) 23.7210526.8494429.9495029.879 >> 3532.2649041.99264100 >> getMRCA_new_cmp(tree, c(1, 2)) 23.1522325.7468629.2152629.253 >> 7030.8091569.68522100 >> >> So it might be worth compiling. >> >> HTH. >> JWB >> ________________________________________ >> Joseph W. Brown >> Post-doctoral Researcher, Smith Laboratory >> University of Michigan >> Department of Ecology & Evolutionary Biology >> Room 2071, Kraus Natural Sciences Building >> Ann Arbor MI 48109-1079 >> josep...@umich.edu <mailto:josep...@umich.edu> >> >> >> >> On 10 Jun, 2017, at 05:26, Emmanuel Paradis <emmanuel.para...@ird.fr >>> <mailto:emmanuel.para...@ird.fr>> wrote: >>> >>> Joseph, Klaus, >>> >>> This is great. I slightly edited the code and renamed the argument >>> 'tips' to 'tip' (as in the original definition). I run on a bunch of random >>> trees and the results are exactly identical with the current version of >>> getMRCA. I'm changing the code in ape now. >>> >>> Cheers, >>> >>> Emmanuel >>> >>> getMRCA <- function(phy, tip) >>> { >>> if (!inherits(phy, "phylo")) >>> stop('object "phy" is not of class "phylo"') >>> if (length(tip) < 2) return(NULL) >>> Ntip <- length(phy$tip.label) >>> rootnd <- Ntip + 1L >>> >>> pars <- integer(phy$Nnode) # worst case assignment, usually far too >>> long >>> mrcaind <- 1L # index in pars of the mrca node. will return highest >>> one traversed by other lineages >>> tnd <- if (is.character(tip)) match(tip, phy$tip.label) else tip >>> >>> ## build a lookup table to get parents faster >>> pvec <- integer(Ntip + phy$Nnode) >>> pvec[phy$edge[, 2]] <- phy$edge[, 1] >>> >>> ## get entire lineage for first tip >>> nd <- tnd[1] >>> for (k in 1:phy$Nnode) { >>> nd <- pvec[nd] >>> pars[k] <- nd >>> if (nd == rootnd) break >>> } >>> pars <- pars[1:k] # delete the rest >>> >>> ## traverse lineages for remaining tips, stop if hit common ancestor >>> for (i in 2:length(tnd)) { >>> done <- FALSE >>> cnd <- tnd[i] >>> while (!done) { >>> cpar <- pvec[cnd] # get immediate parent >>> if (cpar %in% pars) { >>> ## early exit! if the mrca of any set of taxa is the root >>> then we are done >>> if (cpar == rootnd) return(rootnd) >>> idx <- which(pars == cpar) >>> if (idx > mrcaind) mrcaind <- idx >>> done <- TRUE >>> } else cnd <- cpar # keep going! >>> } >>> } >>> pars[mrcaind] >>> } >>> >>> >>> Le 10/06/2017 à 00:28, Klaus Schliep a écrit : >>> >>>> Hi Joseph >>>> I guess I figured out where some room for improvement was. >>>> You grow the vector pars <- c(pars, nd) >>>> I preallocated pars, but you could even just grow pars using pars[k] <- >>>> nd >>>> without pre-allocation. This operation is now (with R >= 3.4) far less >>>> costly as mentioned in the NEWS: >>>> "Assigning to an element of a vector beyond the current length now >>>> over-allocates by a small fraction. The new vector is marked internally >>>> as >>>> growable, and the true length of the new vector is stored in the >>>> truelength >>>> field. This makes building up a vector result by assigning to the next >>>> element beyond the current length more efficient, though pre-allocating >>>> is >>>> still preferred. The implementation is subject to change and not >>>> intended >>>> to be used in packages at this time." >>>> One of the many little improvements recently making R quite a bit faster >>>> and easier. >>>> So the worst case behavior was pretty bad >>>> tree <- stree(100000, "right") >>>> system.time(get_mrca_fast(tree, c(1,2))) >>>> user system elapsed >>>> 13.424 0.012 13.440 >>>> mrca >>>> [1] 199999 >>>> with the changes it is much better: >>>> system.time(mrca <- get_mrca_fast(tree, c(1,2))) >>>> user system elapsed >>>> 0.012 0.000 0.012 >>>> >>>>> mrca >>>>> >>>> [1] 199999 >>>> And here is the code: >>>> get_mrca_fast <- function (phy, tips) { >>>> rootnd <- length(phy$tip.label) + 1; >>>> pars <- integer(phy$Nnode); # worst case assignment, usually far >>>> too >>>> long >>>> mrcaind <- 1; # index in pars of the mrca node. will return >>>> highest one traversed by other lineages >>>> tnd <- NULL; # node ids of tips >>>> if ("character" %in% class(tips)) { >>>> tnd <- match(tips, phy$tip.label); >>>> } else { >>>> tnd <- tips; >>>> } >>>> # build a lookup table to get parents faster >>>> pvec <- integer(max(phy$edge)); >>>> pvec[phy$edge[,2]] <- phy$edge[,1]; >>>> # get entire lineage for first tip >>>> nd <- tnd[1]; >>>> for(k in 1:phy$Nnode){ >>>> nd <- pvec[nd] >>>> pars[k] <- nd >>>> if (nd == rootnd) break >>>> } >>>> pars <- pars[1:k] #delete the rest >>>> # traverse lineages for remaining tips, stop if hit common ancestor >>>> for (i in 2:length(tnd)) { >>>> done <- FALSE; >>>> cnd <- tnd[i]; >>>> while (!done) { >>>> cpar <- pvec[cnd]; # get immediate parent >>>> if (cpar %in% pars) { >>>> if (cpar == rootnd) { >>>> # early exit! if the mrca of any set of taxa is the >>>> root then we are done >>>> return(rootnd); >>>> } >>>> idx <- which(pars == cpar); >>>> if (idx > mrcaind) { >>>> mrcaind <- idx; >>>> } >>>> done <- TRUE; >>>> } else { >>>> # keep going! >>>> cnd <- cpar; >>>> } >>>> } >>>> } >>>> return(pars[mrcaind]); >>>> } >>>> Have a nice weekend, >>>> Klaus >>>> On Fri, Jun 9, 2017 at 3:59 PM, Joseph W. Brown <josep...@umich.edu >>>> <mailto:josep...@umich.edu>> wrote: >>>> >>>>> Liam, >>>>> >>>>> I put the (updated) code up as a gist <https://gist.github.com/josep >>>>> hwb/ >>>>> ad61fd29ed4fb06e712e23d67422c813>. Feel free to use/improve/whatever. >>>>> Emmanuel sees room for improvement but frankly taking 0.3 seconds to >>>>> find >>>>> the MRCA of 5000 taxa on a million-tip tree is good enough for my >>>>> purposes. >>>>> >>>>> JWB >>>>> ________________________________________ >>>>> Joseph W. Brown >>>>> Post-doctoral Researcher, Smith Laboratory >>>>> University of Michigan >>>>> Department of Ecology & Evolutionary Biology >>>>> Room 2071, Kraus Natural Sciences Building >>>>> Ann Arbor MI 48109-1079 >>>>> josep...@umich.edu <mailto:josep...@umich.edu> >>>>> >>>>> >>>>> >>>>> >>>>> On 9 Jun, 2017, at 15:20, Liam J. Revell <liam.rev...@umb.edu> wrote: >>>>>> >>>>>> On the other hand, phytools does have a function - the somewhat >>>>>> >>>>> imprecisely named fastMRCA - which can find the MRCA of just a pair of >>>>> species much faster than getMRCA (however still slower than or only >>>>> about >>>>> as fast as Joseph & Klaus's solutions). >>>>> >>>>>> >>>>>> Liam J. Revell, Associate Professor of Biology >>>>>> University of Massachusetts Boston >>>>>> web: http://faculty.umb.edu/liam.revell/ >>>>>> email: liam.rev...@umb.edu >>>>>> blog: http://blog.phytools.org >>>>>> >>>>>> On 6/9/2017 5:22 AM, Liam J. Revell wrote: >>>>>> >>>>>>> >>>>>>> Juan. findMRCA was written before getMRCA existed, but the latter was >>>>>>> faster so now it just calls getMRCA internally. All the best, Liam >>>>>>> >>>>>>> Liam J. Revell, Associate Professor of Biology >>>>>>> University of Massachusetts Boston >>>>>>> web: http://faculty.umb.edu/liam.revell/ >>>>>>> email: liam.rev...@umb.edu >>>>>>> blog: http://blog.phytools.org >>>>>>> >>>>>>> On 6/9/2017 1:57 AM, Juan Antonio Balbuena wrote: >>>>>>> >>>>>>>> Package phytools includes a function, findMRCA, that is supposed to >>>>>>>> >>>>>>> work >>>>> >>>>>> very efficiently with large trees. you may wish to compare it with >>>>>>>> your >>>>>>>> function. >>>>>>>> >>>>>>>> Cheers >>>>>>>> >>>>>>>> Juan >>>>>>>> >>>>>>>> -- >>>>>>>> >>>>>>>> Dr. Juan A. Balbuena >>>>>>>> Cavanilles Institute of Biodiversity and Evolutionary Biology >>>>>>>> University of Valencia >>>>>>>> http://www.uv.es/~balbuena <http://www.uv.es/%7Ebalbuena> >>>>>>>> P.O. Box 22085 >>>>>>>> http://www.uv.es/cophylpaco >>>>>>>> <http://www.uv.es/cavanilles/zoomarin/index.htm> >>>>>>>> 46071 Valencia, Spain >>>>>>>> e-mail: j.a.balbu...@uv.es <mailto:j.a.balbu...@uv.es> tel. +34 >>>>>>>> 963 >>>>>>>> 543 658 fax +34 963 543 733 >>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>>>>> *NOTE!*For shipments by EXPRESS COURIER use the following street >>>>>>>> >>>>>>> address: >>>>> >>>>>> C/ Catedrático José Beltrán 2, 46980 Paterna (Valencia), Spain. >>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>>>>> >>>>>>>> >>>>>>>> <https://www.avast.com/sig-email?utm_medium=email&utm_ >>>>>>>> >>>>>>> source=link&utm_campaign=sig-email&utm_content=emailclient> >>>>> >>>>>> >>>>>>>> Libre de virus. www.avast.com >>>>>>>> <https://www.avast.com/sig-email?utm_medium=email&utm_ >>>>>>>> >>>>>>> source=link&utm_campaign=sig-email&utm_content=emailclient> >>>>> >>>>>> >>>>>>>> >>>>>>>> >>>>>>>> <#DAB4FAD8-2DD7-40BB-A1B8-4E2AA1F9FDF2> >>>>>>>> >>>>>>>> >>>>>>>> _______________________________________________ >>>>>>>> R-sig-phylo mailing list - R-sig-phylo@r-project.org >>>>>>>> https://stat.ethz.ch/mailman/listinfo/r-sig-phylo >>>>>>>> Searchable archive at >>>>>>>> http://www.mail-archive.com/r-sig-phylo@r-project.org/ >>>>>>>> >>>>>>>> >>>>>>> _______________________________________________ >>>>>>> R-sig-phylo mailing list - R-sig-phylo@r-project.org >>>>>>> https://stat.ethz.ch/mailman/listinfo/r-sig-phylo >>>>>>> Searchable archive at >>>>>>> http://www.mail-archive.com/r-sig-phylo@r-project.org/ >>>>>>> >>>>>> >>>>> >>>>> [[alternative HTML version deleted]] >>>>> >>>>> _______________________________________________ >>>>> R-sig-phylo mailing list - R-sig-phylo@r-project.org >>>>> https://stat.ethz.ch/mailman/listinfo/r-sig-phylo >>>>> Searchable archive at http://www.mail-archive.com/r- >>>>> sig-ph...@r-project.org/ >>>>> >>>>> >> >> >> -- Klaus Schliep Postdoctoral Fellow Revell Lab, University of Massachusetts Boston http://www.phangorn.org/ [[alternative HTML version deleted]] _______________________________________________ R-sig-phylo mailing list - R-sig-phylo@r-project.org https://stat.ethz.ch/mailman/listinfo/r-sig-phylo Searchable archive at http://www.mail-archive.com/r-sig-phylo@r-project.org/