This is an automated email from the git hooks/post-receive script. tille pushed a commit to branch master in repository pvclust.
commit 7c70de8c2ac6d761308d246a9ad0ae68d00ed543 Author: Andreas Tille <[email protected]> Date: Wed Nov 4 16:47:19 2015 +0100 Imported Upstream version 2.0-0 --- DESCRIPTION | 8 +- MD5 | 10 +- NAMESPACE | 6 + R/pvclust-internal.R | 347 ++++++++++++++++++++++++++------- R/pvclust.R | 536 +++++++++++++++++++++++---------------------------- man/pvclust.Rd | 79 +++++--- 6 files changed, 590 insertions(+), 396 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a6b4595..d387b17 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pvclust -Version: 1.3-2 -Date: 2014-12-19 +Version: 2.0-0 +Date: 2015-10-23 Title: Hierarchical Clustering with P-Values via Multiscale Bootstrap Resampling Author: Ryota Suzuki <[email protected]>, Hidetoshi Shimodaira @@ -14,7 +14,7 @@ Description: An implementation of multiscale bootstrap resampling for BP (bootstrap probability) value for each cluster in a dendrogram. License: GPL (>= 2) URL: http://www.sigmath.es.osaka-u.ac.jp/shimo-lab/prog/pvclust/ -Packaged: 2014-12-19 05:24:46 UTC; suzuki NeedsCompilation: no +Packaged: 2015-10-23 11:28:03 UTC; suzuki Repository: CRAN -Date/Publication: 2014-12-22 06:28:55 +Date/Publication: 2015-10-23 16:23:16 diff --git a/MD5 b/MD5 index d3a054d..993a62a 100644 --- a/MD5 +++ b/MD5 @@ -1,13 +1,13 @@ -c8d270dc7740a01321089f869aa222d8 *DESCRIPTION -97c5dfc9e62e9ba7cd789609bc5d7f99 *NAMESPACE -78aaa368d0e21dfa99912aeb0e1031fc *R/pvclust-internal.R -667584464eb31a68d58f274d440f01ad *R/pvclust.R +a8cfdf20f013a1782f62540de8c31cc7 *DESCRIPTION +5a968cf1c9f480e1f6ddf26248f54b56 *NAMESPACE +7a00ed1667f341eaa681f8d9d3d058d9 *R/pvclust-internal.R +75d16c5ca32ecd0a133b5764479989e1 *R/pvclust.R 4ce445fdf8be068ed0f770d3c7bafd17 *data/lung.RData 4dd897fecd4b0175cf3318af419bebab *man/lung.Rd 25915dafebbf8ed6fb9234776ae7349c *man/msfit.Rd f2b43095e239bcd8edd83cb0d6b6352c *man/msplot.Rd c2c1b6dbd2843eeccbee5ec24da11b49 *man/plot.pvclust.Rd 1a64c1ea3c377fad1a8d81c67c34f0a7 *man/print.pvclust.Rd -10d182003a1d3ab8ab43b6a0f3759974 *man/pvclust.Rd +4c4235b3d254542f783980bcde9b03d2 *man/pvclust.Rd cfa83769e276f25bdb5c83b430553af5 *man/pvpick.Rd 3ab1fb9eebdc5f8b3915d71258bee39c *man/seplot.Rd diff --git a/NAMESPACE b/NAMESPACE index b276cec..56cb754 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,3 +16,9 @@ S3method(plot, msfit) S3method(lines, msfit) S3method(summary, msfit) +importFrom("grDevices", "n2mfrow") +importFrom("graphics", "curve", "lines", "par", "plot", "rect", + "segments", "text") +importFrom("stats", "as.dist", "cor", "dist", "dnorm", "hclust", + "lsfit", "na.omit", "pchisq", "pnorm", "qnorm", "rmultinom") +importFrom("utils", "capture.output", "head", "packageVersion") diff --git a/R/pvclust-internal.R b/R/pvclust-internal.R index 22d6417..e636aca 100644 --- a/R/pvclust-internal.R +++ b/R/pvclust-internal.R @@ -1,3 +1,176 @@ +### internal function for non-parallel pvclust +pvclust.nonparallel <- function(data, method.hclust, method.dist, use.cor, nboot, r, + store, weight, iseed, quiet) +{ + # initialize random seed + if(!is.null(iseed)) + set.seed(seed = iseed) + + # data: (n,p) matrix, n-samples, p-variables + n <- nrow(data); p <- ncol(data) + + # hclust for original data + # METHODS <- c("ward", "single", "complete", "average", "mcquitty", + # "median", "centroid") + # method.hclust <- METHODS[pmatch(method.hclust, METHODS)] + + # Use custom distance function + if(is.function(method.dist)) { + distance <- method.dist(data) + } else { + distance <- dist.pvclust(data, method=method.dist, use.cor=use.cor) + } + + data.hclust <- hclust(distance, method=method.hclust) + + # ward -> ward.D + # only if R >= 3.1.0 + if(method.hclust == "ward" && getRversion() >= '3.1.0') { + method.hclust <- "ward.D" + } + + # multiscale bootstrap + size <- floor(n*r) + rl <- length(size) + + if(rl == 1) { + if(r != 1.0) + warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n") + + r <- list(1.0) + } + else + r <- as.list(size/n) + + mboot <- lapply(r, boot.hclust, data=data, object.hclust=data.hclust, nboot=nboot, + method.dist=method.dist, use.cor=use.cor, + method.hclust=method.hclust, store=store, weight=weight, quiet=quiet) + + result <- pvclust.merge(data=data, object.hclust=data.hclust, mboot=mboot) + + return(result) +} + + +### internal function for parallel pvclust +pvclust.parallel <- function(cl, data, method.hclust, method.dist, use.cor, + nboot, r, store, weight, init.rand=NULL, iseed, quiet, + parallel.check) +{ + if(parallel.check) { + check.result <- check.parallel(cl=cl, nboot=nboot) + if(!check.result) { + msg <- paste(attr(check.result, "msg"), ". non-parallel version is executed", sep = "") + warning(msg) + return(pvclust.nonparallel(data=data, method.hclust=method.hclust, method.dist=method.dist, + use.cor=use.cor, nboot=nboot, r=r, store=store, weight=weight, iseed=iseed, quiet=quiet)) + } + } + + # check package versions + pkg.ver <-parallel::clusterCall(cl, packageVersion, pkg = "pvclust") + r.ver <- parallel::clusterCall(cl, getRversion) + if(length(unique(pkg.ver)) > 1 || length(unique(r.ver)) > 1) { + + node.name <- parallel::clusterEvalQ(cl, Sys.info()["nodename"]) + version.table <- data.frame( + node=seq_len(length(node.name)), + name=unlist(node.name), + R=unlist(lapply(r.ver, as.character)), + pvclust=unlist(lapply(pkg.ver, as.character))) + + if(nrow(version.table) > 10) + table.out <- c(capture.output(print(head(version.table, n=10), row.names=FALSE)), " ...") + else + table.out <- capture.output(print(version.table, row.names=FALSE)) + + warning("R/pvclust versions are not unique:\n", + paste(table.out, collapse="\n")) + } + + if(!is.null(init.rand)) + warning("\"init.rand\" option is deprecated. It is available for back compatibility but will be unavailable in the future.\nSpecify a non-NULL value of \"iseed\" to initialize random seed.") + + # if(init.rand) { + # if(is.null(iseed) && !is.null(seed)) { + # warning("\"seed\" option is deprecated. It is available for back compatibility but will be unavailable in the future.\nConsider using \"iseed\" instead.") + # + # if(length(seed) != length(cl)) + # stop("seed and cl should have the same length.") + # + # # setting random seeds + # parallel::parLapply(cl, as.list(seed), set.seed) + # } else { + # parallel::clusterSetRNGStream(cl = cl, iseed = iseed) + # } + # } + + if(!is.null(iseed) && (is.null(init.rand) || init.rand)) + parallel::clusterSetRNGStream(cl = cl, iseed = iseed) + + # data: (n,p) matrix, n-samples, p-variables + n <- nrow(data); p <- ncol(data) + + # hclust for original data + if(is.function(method.dist)) { + # Use custom distance function + distance <- method.dist(data) + } else { + distance <- dist.pvclust(data, method=method.dist, use.cor=use.cor) + } + + data.hclust <- hclust(distance, method=method.hclust) + + # ward -> ward.D + # only if R >= 3.1.0 + if(method.hclust == "ward" && getRversion() >= '3.1.0') { + method.hclust <- "ward.D" + } + + # multiscale bootstrap + size <- floor(n*r) + rl <- length(size) + + if(rl == 1) { + if(r != 1.0) + warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n") + + r <- list(1.0) + } + else + r <- as.list(size/n) + + ncl <- length(cl) + nbl <- as.list(rep(nboot %/% ncl,times=ncl)) + + if((rem <- nboot %% ncl) > 0) + nbl[1:rem] <- lapply(nbl[1:rem], "+", 1) + + if(!quiet) + cat("Multiscale bootstrap... ") + + mlist <- parallel::parLapply(cl, nbl, pvclust.node, + r=r, data=data, object.hclust=data.hclust, method.dist=method.dist, + use.cor=use.cor, method.hclust=method.hclust, + store=store, weight=weight, quiet=quiet) + if(!quiet) + cat("Done.\n") + + mboot <- mlist[[1]] + + for(i in 2:ncl) { + for(j in 1:rl) { + mboot[[j]]$edges.cnt <- mboot[[j]]$edges.cnt + mlist[[i]][[j]]$edges.cnt + mboot[[j]]$nboot <- mboot[[j]]$nboot + mlist[[i]][[j]]$nboot + mboot[[j]]$store <- c(mboot[[j]]$store, mlist[[i]][[j]]$store) + } + } + + result <- pvclust.merge( data=data, object.hclust=data.hclust, mboot=mboot) + + return(result) +} + hc2axes <- function(x) { A <- x$merge # (n,n-1) matrix @@ -7,87 +180,88 @@ hc2axes <- function(x) x.tmp <- rep(0,2) zz <- match(1:length(x$order),x$order) - - for(i in 1:(n-1)) { - ai <- A[i,1] - - if(ai < 0) - x.tmp[1] <- zz[-ai] - else - x.tmp[1] <- x.axis[ai] - - ai <- A[i,2] - - if(ai < 0) - x.tmp[2] <- zz[-ai] - else - x.tmp[2] <- x.axis[ai] - - x.axis[i] <- mean(x.tmp) - } + + for(i in 1:(n-1)) { + ai <- A[i,1] + + if(ai < 0) + x.tmp[1] <- zz[-ai] + else + x.tmp[1] <- x.axis[ai] + + ai <- A[i,2] + + if(ai < 0) + x.tmp[2] <- zz[-ai] + else + x.tmp[2] <- x.axis[ai] + + x.axis[i] <- mean(x.tmp) + } return(data.frame(x.axis=x.axis,y.axis=y.axis)) } hc2split <- function(x) - { - A <- x$merge # (n-1,n) matrix - n <- nrow(A) + 1 - B <- list() - - for(i in 1:(n-1)){ - ai <- A[i,1] - - if(ai < 0) - B[[i]] <- -ai - else - B[[i]] <- B[[ai]] - - ai <- A[i,2] - - if(ai < 0) - B[[i]] <- sort(c(B[[i]],-ai)) - else - B[[i]] <- sort(c(B[[i]],B[[ai]])) - } - - CC <- matrix(rep(0,n*(n-1)),nrow=(n-1),ncol=n) +{ + A <- x$merge # (n-1,n) matrix + n <- nrow(A) + 1 + B <- list() + + for(i in 1:(n-1)){ + ai <- A[i,1] - for(i in 1:(n-1)){ - bi <- B[[i]] - m <- length(bi) - for(j in 1:m) - CC[i,bi[j]] <- 1 - } - - split <- list(pattern=apply(CC,1,paste,collapse=""), member=B) + if(ai < 0) + B[[i]] <- -ai + else + B[[i]] <- B[[ai]] + + ai <- A[i,2] - return(split) + if(ai < 0) + B[[i]] <- sort(c(B[[i]],-ai)) + else + B[[i]] <- sort(c(B[[i]],B[[ai]])) } - -pvclust.node <- function(x, r,...) - { -# require(pvclust) - mboot.node <- lapply(r, boot.hclust, nboot=x, ...) - return(mboot.node) + + CC <- matrix(rep(0,n*(n-1)),nrow=(n-1),ncol=n) + + for(i in 1:(n-1)){ + bi <- B[[i]] + m <- length(bi) + for(j in 1:m) + CC[i,bi[j]] <- 1 } + + split <- list(pattern=apply(CC,1,paste,collapse=""), member=B) + + return(split) +} + +pvclust.node <- function(x, r, ...) +{ + # require(pvclust) + mboot.node <- lapply(r, boot.hclust, nboot=x, ...) + return(mboot.node) +} boot.hclust <- function(r, data, object.hclust, method.dist, use.cor, - method.hclust, nboot, store, weight=F) + method.hclust, nboot, store, weight=FALSE, quiet=FALSE) { n <- nrow(data) size <- round(n*r, digits=0) if(size == 0) stop("invalid scale parameter(r)") r <- size/n - + pattern <- hc2split(object.hclust)$pattern edges.cnt <- table(factor(pattern)) - table(factor(pattern)) st <- list() # bootstrap start rp <- as.character(round(r,digits=2)); if(r == 1) rp <- paste(rp,".0",sep="") - cat(paste("Bootstrap (r = ", rp, ")... ", sep="")) + if(!quiet) + cat(paste("Bootstrap (r = ", rp, ")... ", sep="")) w0 <- rep(1,n) # equal weight na.flag <- 0 @@ -97,7 +271,11 @@ boot.hclust <- function(r, data, object.hclust, method.dist, use.cor, suppressWarnings(distance <- distw.pvclust(data,w1,method=method.dist,use.cor=use.cor)) } else { smpl <- sample(1:n, size, replace=TRUE) - suppressWarnings(distance <- dist.pvclust(data[smpl,],method=method.dist,use.cor=use.cor)) + if(is.function(method.dist)) { + suppressWarnings(distance <- method.dist(data[smpl,])) + } else { + suppressWarnings(distance <- dist.pvclust(data[smpl,],method=method.dist,use.cor=use.cor)) + } } if(all(is.finite(distance))) { # check if distance is valid x.hclust <- hclust(distance,method=method.hclust) @@ -105,18 +283,19 @@ boot.hclust <- function(r, data, object.hclust, method.dist, use.cor, edges.cnt <- edges.cnt + table(factor(pattern.i, levels=pattern)) } else { x.hclust <- NULL - na.flag <- 1 + na.flag <- 1 } - + if(store) st[[i]] <- x.hclust } - cat("Done.\n") + if(!quiet) + cat("Done.\n") # bootstrap done if(na.flag == 1) - warning(paste("inappropriate distance matrices are omitted in computation: r = ", r), call.=FALSE) - + warning(paste("inappropriate distance matrices are omitted in computation: r = ", r), call.=FALSE) + boot <- list(edges.cnt=edges.cnt, method.dist=method.dist, use.cor=use.cor, method.hclust=method.hclust, nboot=nboot, size=size, r=r, store=st) class(boot) <- "boot.hclust" @@ -138,7 +317,7 @@ pvclust.merge <- function(data, object.hclust, mboot){ edges.bp <- edges.cnt <- data.frame(matrix(rep(0,ne*rl),nrow=ne,ncol=rl)) row.names(edges.bp) <- pattern names(edges.cnt) <- paste("r", 1:rl, sep="") - + for(j in 1:rl) { edges.cnt[,j] <- as.vector(mboot[[j]]$edges.cnt) edges.bp[,j] <- edges.cnt[,j] / nboot[j] @@ -164,12 +343,12 @@ pvclust.merge <- function(data, object.hclust, mboot){ edges.pv <- data.frame(au=au, bp=bp, se.au=se.au, se.bp=se.bp, v=v, c=cc, pchi=pchi) - + row.names(edges.pv) <- row.names(edges.cnt) <- 1:ne - + result <- list(hclust=object.hclust, edges=edges.pv, count=edges.cnt, msfit=ms.fitted, nboot=nboot, r=r, store=store) - + class(result) <- "pvclust" return(result) } @@ -206,18 +385,18 @@ dist.pvclust <- function(x, method="euclidean", use.cor="pairwise.complete.obs") corw <- function(x,w, use=c("all.obs","complete.obs","pairwise.complete.obs") - ) { +) { if(is.data.frame(x)) x <- as.matrix(x) x <- x[w>0,,drop=F] w <- w[w>0] - + n <- nrow(x) # sample size m <- ncol(x) # number of variables if(missing(w)) w <- rep(1,n) r <- matrix(0,m,m,dimnames=list(colnames(x),colnames(x))) diag(r) <- 1 use <- match.arg(use) - + pairu <- F if(use=="all.obs") { u <- rep(T,n) @@ -268,3 +447,29 @@ distw.pvclust <- function(x,w,method="correlation", use.cor="pairwise.complete.o } stop("wrong method") } + +### check whether parallel computation is appropriate +check.parallel <- function(cl, nboot) { + res <- FALSE + +### will be used when defaultCluster(cl) becomes publicly available +# # check whether cl is a cluster, or a default cluster is available +# if(!inherits(cl, "cluster")) { +# try_result <- try(cl <- parallel:::defaultCluster(cl), silent=TRUE) +# if(class(try_result) == "try-error") { +# attr(res, "msg" <- "cl is not a cluster") +# return(res) +# } +# } + + ncl <- length(cl) + if(ncl < 2) { + attr(res, "msg") <- "Cluster size is too small (or NULL)" + } else if (ncl > nboot) { + attr(res, "msg") <- "nboot is too small for cluster size" + } else { + res <- TRUE + } + + return(res) +} \ No newline at end of file diff --git a/R/pvclust.R b/R/pvclust.R index 74bbc61..37e791e 100644 --- a/R/pvclust.R +++ b/R/pvclust.R @@ -1,41 +1,82 @@ -pvclust <- function(data, method.hclust="average", - method.dist="correlation", use.cor="pairwise.complete.obs", - nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE) - { - # data: (n,p) matrix, n-samples, p-variables - n <- nrow(data); p <- ncol(data) - - # hclust for original data -# METHODS <- c("ward", "single", "complete", "average", "mcquitty", -# "median", "centroid") -# method.hclust <- METHODS[pmatch(method.hclust, METHODS)] - distance <- dist.pvclust(data, method=method.dist, use.cor=use.cor) - data.hclust <- hclust(distance, method=method.hclust) - - # ward -> ward.D - if(method.hclust == "ward") method.hclust <- "ward.D" - - # multiscale bootstrap - size <- floor(n*r) - rl <- length(size) +pvclust <- function(data, method.hclust="average", method.dist="correlation", + use.cor="pairwise.complete.obs", nboot=1000, parallel=FALSE, + r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE, iseed=NULL, quiet=FALSE) +{ + p <- parallel + + if(is.null(p) || (!is.logical(p) && (!is.integer(p) || p <= 0) && !inherits(p, "cluster"))) + stop("parallel should be a logical, an integer or a cluster object.") + + if(is.logical(p)) { + par.flag <- p + par.size <- NULL + cl <- NULL + } else if(is.integer(p)) { + par.flag <- TRUE + par.size <- p + cl <- NULL + } else if(inherits(p, "cluster")) { + par.flag <- TRUE + cl <- p + } + + if(par.flag && !requireNamespace("parallel", quietly=TRUE)) { + warning("Package parallel is required for parallel computation. Use non-parallel mode instead.") + par.flag <- FALSE + } + + if(par.flag) { - if(rl == 1) { - if(r != 1.0) - warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n") + if(is.null(cl)) { + if(is.null(par.size)) + par.size <- parallel::detectCores() - 1 + + if(!quiet) + cat("Creating a temporary cluster...") + try_result <- try(cl <- parallel::makePSOCKcluster(par.size)) + + if(inherits(try_result, "try-error")) { + if(!quiet) + cat("failed to create a cluster. Use non-parallel mode instead.") + par.flag <- FALSE + } else { + if(!quiet) { + cat("done:\n") + print(cl) + } + on.exit(parallel::stopCluster(cl)) + } + - r <- list(1.0) } - else - r <- as.list(size/n) - - mboot <- lapply(r, boot.hclust, data=data, object.hclust=data.hclust, nboot=nboot, - method.dist=method.dist, use.cor=use.cor, - method.hclust=method.hclust, store=store, weight=weight) - result <- pvclust.merge(data=data, object.hclust=data.hclust, mboot=mboot) + pvclust.parallel(cl=cl, data=data, method.hclust=method.hclust, + method.dist=method.dist, use.cor=use.cor, + nboot=nboot, r=r, store=store, weight=weight, + iseed=iseed, quiet=quiet, parallel.check=TRUE) - return(result) + } else { + pvclust.nonparallel(data=data, method.hclust=method.hclust, + method.dist=method.dist, use.cor=use.cor, + nboot=nboot, r=r, store=store, weight=weight, iseed=iseed, quiet=quiet) } +} + +parPvclust <- function(cl=NULL, data, method.hclust="average", + method.dist="correlation", use.cor="pairwise.complete.obs", + nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, + weight=FALSE, init.rand=NULL, iseed=NULL, quiet=FALSE) { + warning("\"parPvclust\" has been integrated into pvclust (with \"parallel\" option).\nIt is available for back compatibility but will be unavailable in the future.") + + if(!requireNamespace("parallel", quietly=TRUE)) + stop("Package parallel is required for parPvclust.") + + pvclust.parallel(cl=cl, data=data, method.hclust=method.hclust, + method.dist=method.dist, use.cor=use.cor, + nboot=nboot, r=r, store=store, weight=weight, + init.rand=init.rand, iseed=iseed, quiet=quiet, + parallel.check=TRUE) +} plot.pvclust <- function(x, print.pv=TRUE, print.num=TRUE, float=0.01, col.pv=c(2,3,8), cex.pv=0.8, font.pv=NULL, @@ -50,10 +91,10 @@ plot.pvclust <- function(x, print.pv=TRUE, print.num=TRUE, float=0.01, if(is.null(xlab)) xlab=paste("Distance: ", x$hclust$dist.method) - + plot(x$hclust, main=main, sub=sub, xlab=xlab, col=col, cex=cex, font=font, lty=lty, lwd=lwd, ...) - + if(print.pv) text(x, col=col.pv, cex=cex.pv, font=font.pv, float=float, print.num=print.num) } @@ -94,279 +135,192 @@ summary.pvclust <- function(object, ...){ } pvrect <- function(x, alpha=0.95, pv="au", type="geq", max.only=TRUE, border=2, ...) +{ + len <- nrow(x$edges) + member <- hc2split(x$hclust)$member + order <- x$hclust$order + usr <- par("usr") + xwd <- usr[2] - usr[1] + ywd <- usr[4] - usr[3] + cin <- par()$cin + + ht <- c() + j <- 1 + + if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt")))) + stop("Invalid type argument: see help(pvrect)") + + for(i in (len - 1):1) { - len <- nrow(x$edges) - member <- hc2split(x$hclust)$member - order <- x$hclust$order - usr <- par("usr") - xwd <- usr[2] - usr[1] - ywd <- usr[4] - usr[3] - cin <- par()$cin - - ht <- c() - j <- 1 - - if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt")))) - stop("Invalid type argument: see help(pvrect)") + if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or EQuals + else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or EQuals + else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than + else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than - for(i in (len - 1):1) + if(wh) + { + mi <- member[[i]] + ma <- match(mi, order) + + if(max.only == FALSE || (max.only && sum(match(ma, ht, nomatch=0)) == 0)) { - if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or EQuals - else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or EQuals - else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than - else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than - - if(wh) - { - mi <- member[[i]] - ma <- match(mi, order) - - if(max.only == FALSE || (max.only && sum(match(ma, ht, nomatch=0)) == 0)) - { - xl <- min(ma) - xr <- max(ma) - yt <- x$hclust$height[i] - yb <- usr[3] - - mx <- xwd / length(member) / 3 - my <- ywd / 200 - - rect(xl - mx, yb + my, xr + mx, yt + my, border=border, shade=NULL, ...) - - j <- j + 1 - } - ht <- c(ht, ma) - } + xl <- min(ma) + xr <- max(ma) + yt <- x$hclust$height[i] + yb <- usr[3] + + mx <- xwd / length(member) / 3 + my <- ywd / 200 + + rect(xl - mx, yb + my, xr + mx, yt + my, border=border, shade=NULL, ...) + + j <- j + 1 } + ht <- c(ht, ma) + } } +} msplot <- function(x, edges=NULL, ...) - { - if(is.null(edges)) edges <- 1:length(x$msfit) - d <- length(edges) - - mfrow.bak <- par()$mfrow - on.exit(par(mfrow=mfrow.bak)) - - par(mfrow=n2mfrow(d)) - - for(i in edges) { - if(i == 1 || (i %% 10 == 1 && i > 20)) - main <- paste(i, "st edge", sep="") - else if(i == 2 || (i %% 10 == 2 && i > 20)) - main <- paste(i, "nd edge", sep="") - else if(i == 3 || (i %% 10 == 3 && i > 20)) - main <- paste(i, "rd edge", sep="") - else - main <- paste(i, "th edge", sep="") - - plot(x$msfit[[i]], main=main, ...) - } +{ + if(is.null(edges)) edges <- 1:length(x$msfit) + d <- length(edges) + + mfrow.bak <- par()$mfrow + on.exit(par(mfrow=mfrow.bak)) + + par(mfrow=n2mfrow(d)) + + for(i in edges) { + if(i == 1 || (i %% 10 == 1 && i > 20)) + main <- paste(i, "st edge", sep="") + else if(i == 2 || (i %% 10 == 2 && i > 20)) + main <- paste(i, "nd edge", sep="") + else if(i == 3 || (i %% 10 == 3 && i > 20)) + main <- paste(i, "rd edge", sep="") + else + main <- paste(i, "th edge", sep="") + + plot(x$msfit[[i]], main=main, ...) } +} lines.pvclust <- function(x, alpha=0.95, pv="au", type="geq", col=2, lwd=2, ...) +{ + len <- nrow(x$edges) + member <- hc2split(x$hclust)$member + order <- x$hclust$order + usr <- par("usr") + xwd <- usr[2] - usr[1] + ywd <- usr[4] - usr[3] + cin <- par()$cin + + ht <- c() + j <- 1 + + if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt")))) + stop("Invalid type argument: see help(lines.pvclust)") + + for(i in (len - 1):1) { - len <- nrow(x$edges) - member <- hc2split(x$hclust)$member - order <- x$hclust$order - usr <- par("usr") - xwd <- usr[2] - usr[1] - ywd <- usr[4] - usr[3] - cin <- par()$cin - - ht <- c() - j <- 1 - - if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt")))) - stop("Invalid type argument: see help(lines.pvclust)") + if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or EQuals + else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or EQuals + else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than + else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than - for(i in (len - 1):1) + if(wh) + { + mi <- member[[i]] + ma <- match(mi, order) + + if(sum(match(ma, ht, nomatch=0)) == 0) { - if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or EQuals - else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or EQuals - else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than - else if(pm==4) wh <- (x$edges[i,pv] > alpha) # Lower Than - - if(wh) - { - mi <- member[[i]] - ma <- match(mi, order) - - if(sum(match(ma, ht, nomatch=0)) == 0) - { - xl <- min(ma) - xr <- max(ma) - yt <- x$hclust$height[i] - yb <- usr[3] - - mx <- xwd/length(member)/10 - - segments(xl-mx, yb, xr+mx, yb, xpd=TRUE, col=col, lwd=lwd, ...) - - j <- j + 1 - } - ht <- c(ht, ma) - } + xl <- min(ma) + xr <- max(ma) + yt <- x$hclust$height[i] + yb <- usr[3] + + mx <- xwd/length(member)/10 + + segments(xl-mx, yb, xr+mx, yb, xpd=TRUE, col=col, lwd=lwd, ...) + + j <- j + 1 } + ht <- c(ht, ma) + } } +} pvpick <- function(x, alpha=0.95, pv="au", type="geq", max.only=TRUE) +{ + len <- nrow(x$edges) + member <- hc2split(x$hclust)$member + order <- x$hclust$order + + ht <- c() + a <- list(clusters=list(), edges=c()); j <- 1 + + if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt")))) + stop("Invalid type argument: see help(pickup)") + + for(i in (len - 1):1) { - len <- nrow(x$edges) - member <- hc2split(x$hclust)$member - order <- x$hclust$order - - ht <- c() - a <- list(clusters=list(), edges=c()); j <- 1 - - if(is.na(pm <- pmatch(type, c("geq", "leq", "gt", "lt")))) - stop("Invalid type argument: see help(pickup)") + if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or Equals + else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or Equals + else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than + else if(pm==4) wh <- (x$edges[i,pv] < alpha) # Lower Than - for(i in (len - 1):1) - { - if (pm==1) wh <- (x$edges[i,pv] >= alpha) # Greater than or Equals - else if(pm==2) wh <- (x$edges[i,pv] <= alpha) # Lower than or Equals - else if(pm==3) wh <- (x$edges[i,pv] > alpha) # Greater Than - else if(pm==4) wh <- (x$edges[i,pv] < alpha) # Lower Than - - if(wh) - { - mi <- member[[i]] - ma <- match(mi, order) - - if(max.only == FALSE || (max.only && sum(match(ma, ht, nomatch=0)) == 0)) - { - a$clusters[[j]] <- x$hclust$labels[mi] - a$edges <- c(a$edges,i) - - j <- j + 1 - } - ht <- c(ht, ma) - } - } - - a$edges <- a$edges[length(a$edges):1] - a$clusters <- a$clusters[length(a$edges):1] - - return(a) - } - -parPvclust <- function(cl=NULL, data, method.hclust="average", - method.dist="correlation", use.cor="pairwise.complete.obs", - nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, - weight=FALSE, - init.rand=TRUE, seed=NULL, iseed=NULL) - { - # if(!(require(snow))) stop("Package snow is required for parPvclust.") - if(!(require(parallel))) stop("Package parallel is required for parPvclust.") - - if((ncl <- length(cl)) < 2 || ncl > nboot) { - if(ncl > nboot) - warning("Too small value for nboot: non-parallel version is executed.") - else - warning("Too small (or NULL) cluster: non-parallel version is executed.") + if(wh) + { + mi <- member[[i]] + ma <- match(mi, order) - return(pvclust(data,method.hclust,method.dist,use.cor,nboot,r,store)) - } - - if(init.rand) { - if(is.null(iseed) && !is.null(seed)) { - warning("\"seed\" option is deprecated. It is available for back compatibility but will be unavailable in the future.\nConsider using \"iseed\" instead.") - - if(length(seed) != length(cl)) - stop("seed and cl should have the same length.") + if(max.only == FALSE || (max.only && sum(match(ma, ht, nomatch=0)) == 0)) + { + a$clusters[[j]] <- x$hclust$labels[mi] + a$edges <- c(a$edges,i) - # setting random seeds - parallel::parLapply(cl, as.list(seed), set.seed) - } else { - parallel::clusterSetRNGStream(cl = cl, iseed = iseed) - } - } - - # data: (n,p) matrix, n-samples, p-variables - n <- nrow(data); p <- ncol(data) - - # hclust for original data - #METHODS <- c("ward", "single", "complete", "average", "mcquitty", - # "median", "centroid") - #method.hclust <- METHODS[pmatch(method.hclust, METHODS)] - - distance <- dist.pvclust(data, method=method.dist, use.cor=use.cor) - data.hclust <- hclust(distance, method=method.hclust) - - # ward -> ward.D - if(method.hclust == "ward") method.hclust <- "ward.D" - - # multiscale bootstrap - size <- floor(n*r) - rl <- length(size) - - if(rl == 1) { - if(r != 1.0) - warning("Relative sample size r is set to 1.0. AU p-values are not calculated\n") - - r <- list(1.0) - } - else - r <- as.list(size/n) - - nbl <- as.list(rep(nboot %/% ncl,times=ncl)) - - if((rem <- nboot %% ncl) > 0) - nbl[1:rem] <- lapply(nbl[1:rem], "+", 1) - - cat("Multiscale bootstrap... ") - - mlist <- parallel::parLapply(cl, nbl, pvclust.node, - r=r, data=data, object.hclust=data.hclust, method.dist=method.dist, - use.cor=use.cor, method.hclust=method.hclust, - store=store, weight=weight) - cat("Done.\n") - - mboot <- mlist[[1]] - - for(i in 2:ncl) { - for(j in 1:rl) { - mboot[[j]]$edges.cnt <- mboot[[j]]$edges.cnt + mlist[[i]][[j]]$edges.cnt - mboot[[j]]$nboot <- mboot[[j]]$nboot + mlist[[i]][[j]]$nboot - mboot[[j]]$store <- c(mboot[[j]]$store, mlist[[i]][[j]]$store) + j <- j + 1 } + ht <- c(ht, ma) } - - result <- pvclust.merge( data=data, object.hclust=data.hclust, mboot=mboot) - - return(result) } + + a$edges <- a$edges[length(a$edges):1] + a$clusters <- a$clusters[length(a$edges):1] + + return(a) +} msfit <- function(bp, r, nboot) { - + if(length(bp) != length(r)) stop("bp and r should have the same length") - + nboot <- rep(nboot, length=length(bp)) - + use <- bp > 0 & bp < 1 - + p <- se <- c(0,0); names(p) <- names(se) <- c("au", "bp") coef <- c(0,0); names(coef) <- c("v", "c") - + a <- list(p=p, se=se, coef=coef, df=0, rss=0, pchi=0); class(a) <- "msfit" - + if(sum(use) < 2) { # if(mean(bp) < .5) a$p[] <- c(0, 0) else a$p[] <- c(1, 1) if(mean(bp) < .5) a$p[] <- c(0, bp[r==1.0]) else a$p[] <- c(1, bp[r==1.0]) return(a) } - + bp <- bp[use]; r <- r[use]; nboot <- nboot[use] zz <- -qnorm(bp) vv <- ((1 - bp) * bp) / (dnorm(zz)^2 * nboot) a$use <- use; a$r <- r; a$zz <- zz - + X <- cbind(sqrt(r), 1/sqrt(r)); dimnames(X) <- list(NULL, c("v","c")) fit <- lsfit(X, zz, 1/vv, intercept=FALSE) a$coef <- coef <- fit$coef - + h.au <- c(1, -1); h.bp <- c(1, 1) z.au <- drop(h.au %*% coef); z.bp <- drop(h.bp %*% coef) @@ -380,7 +334,7 @@ msfit <- function(bp, r, nboot) { a$pchi <- pchisq(a$rss, lower.tail=FALSE, df=a$df) } else a$pchi <- 1.0 - + return(a) } @@ -388,13 +342,13 @@ plot.msfit <- function(x, curve=TRUE, main=NULL, sub=NULL, xlab=NULL, ylab=NULL, { if(is.null(main)) main="Curve fitting for multiscale bootstrap resampling" if(is.null(sub)) - { - sub <- paste("AU = ", round(x$p["au"], digits=2), - ", BP = ", round(x$p["bp"], digits=2), - ", v = ", round(x$coef["v"], digits=2), - ", c = ", round(x$coef["c"], digits=2), - ", pchi = ", round(x$pchi, digits=2)) - } + { + sub <- paste("AU = ", round(x$p["au"], digits=2), + ", BP = ", round(x$p["bp"], digits=2), + ", v = ", round(x$coef["v"], digits=2), + ", c = ", round(x$coef["c"], digits=2), + ", pchi = ", round(x$pchi, digits=2)) + } if(is.null(xlab)) xlab=expression(sqrt(r)) if(is.null(ylab)) ylab=expression(z-value) @@ -418,16 +372,16 @@ lines.msfit <- function(x, col=2, lty=1, ...) { summary.msfit <- function(object, digits=3, ...) { cat("\nResult of curve fitting for multiscale bootstrap resampling:\n\n") - + cat("Estimated p-values:\n") pv <- data.frame(object$p, object$se) names(pv) <- c("Estimate", "Std. Error"); row.names(pv) <- c("au", "bp") print(pv, digits=digits); cat("\n") - + cat("Estimated coefficients:\n") coef <- object$coef print(coef, digits=digits); cat("\n") - + cat(paste("Residual sum of squares: ", round(object$rss,digits=digits)), ", p-value: ", round(object$pchi, digits=digits), " on ", object$df, " DF\n\n", sep="") @@ -435,22 +389,22 @@ summary.msfit <- function(object, digits=3, ...) { seplot <- function(object, type=c("au", "bp"), identify=FALSE, main=NULL, xlab=NULL, ylab=NULL, ...) - { - if(!is.na(pm <- pmatch(type[1], c("au", "bp")))) { - wh <- c("au", "bp")[pm] - - if(is.null(main)) - main <- "p-value vs standard error plot" - if(is.null(xlab)) - xlab <- c("AU p-value", "BP value")[pm] - if(is.null(ylab)) - ylab <- "Standard Error" - - plot(object$edges[,wh], object$edges[,paste("se", wh, sep=".")], - main=main, xlab=xlab, ylab=ylab, ...) - if(identify) - identify(x=object$edges[,wh], y=object$edges[,paste("se", wh, sep=".")], - labels=row.names(object$edges)) - } - else stop("'type' should be \"au\" or \"bp\".") +{ + if(!is.na(pm <- pmatch(type[1], c("au", "bp")))) { + wh <- c("au", "bp")[pm] + + if(is.null(main)) + main <- "p-value vs standard error plot" + if(is.null(xlab)) + xlab <- c("AU p-value", "BP value")[pm] + if(is.null(ylab)) + ylab <- "Standard Error" + + plot(object$edges[,wh], object$edges[,paste("se", wh, sep=".")], + main=main, xlab=xlab, ylab=ylab, ...) + if(identify) + identify(x=object$edges[,wh], y=object$edges[,paste("se", wh, sep=".")], + labels=row.names(object$edges)) } + else stop("'type' should be \"au\" or \"bp\".") +} diff --git a/man/pvclust.Rd b/man/pvclust.Rd index a71d830..fb6ac76 100644 --- a/man/pvclust.Rd +++ b/man/pvclust.Rd @@ -10,26 +10,29 @@ \usage{ pvclust(data, method.hclust="average", method.dist="correlation", use.cor="pairwise.complete.obs", - nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE) + nboot=1000, parallel=FALSE, r=seq(.5,1.4,by=.1), + store=FALSE, weight=FALSE, iseed=NULL, quiet=FALSE) parPvclust(cl=NULL, data, method.hclust="average", method.dist="correlation", use.cor="pairwise.complete.obs", nboot=1000, r=seq(.5,1.4,by=.1), store=FALSE, weight=FALSE, - init.rand=TRUE, seed=NULL, iseed=NULL) + init.rand=NULL, iseed=NULL, quiet=FALSE) } \arguments{ \item{data}{numeric data matrix or data frame.} \item{method.hclust}{ the agglomerative method used in hierarchical clustering. This - should be (an abbreviation of) one of \code{"average"}, \code{"ward"}, - \code{"single"}, \code{"complete"}, \code{"mcquitty"}, + should be (an abbreviation of) one of \code{"average"}, \code{"ward.D"}, + \code{"ward.D2"}, \code{"single"}, \code{"complete"}, \code{"mcquitty"}, \code{"median"} or \code{"centroid"}. The default is \code{"average"}. See \code{method} argument in \code{\link[stats]{hclust}}. } - \item{method.dist}{the distance measure to be used. This should be (an - abbreviation of) one of \code{"correlation"}, \code{"uncentered"}, + \item{method.dist}{the distance measure to be used. This should be + a character string, or a function which returns a \code{dist} object. + A character string should be (an abbreviation of) + one of \code{"correlation"}, \code{"uncentered"}, \code{"abscor"} or those which are allowed for \code{method} argument in \code{dist} function. The default is \code{"correlation"}. See \emph{details} section in this help and @@ -43,6 +46,15 @@ parPvclust(cl=NULL, data, method.hclust="average", } \item{nboot}{the number of bootstrap replications. The default is \code{1000}.} + \item{parallel}{switch for parallel computation. + If \code{FALSE} the computation is done in non-parallel mode. + If \code{TRUE} or a positive integer is supplied, + parallel computation is done with automatically generated PSOCK cluster. + Use \code{TRUE} for default cluster size (\code{parallel::detectCores() - 1}), + or specify the size by an integer. + If a \code{cluster} object is supplied the cluster is used for parallel computation. + Note that \code{NULL} is currently not allowed for using the default cluster. + } \item{r}{numeric vector which specifies the relative sample sizes of bootstrap replications. For original sample size \eqn{n} and bootstrap sample size \eqn{n'}, this is defined as \eqn{r=n'/n}.} @@ -61,11 +73,12 @@ parPvclust(cl=NULL, data, method.hclust="average", % length as \code{cl}. If \code{NULL} is specified, % \code{1:length(cl)} is used as seed vector. The default is \code{NULL}.} \item{init.rand}{logical. If \code{init.rand=TRUE}, random number generators are initialized. - Use \code{iseed} argument to achieve reproducible results.} - \item{seed}{integer vector of random seeds. It should have the same - length as \code{cl}. \strong{This argument is duplicated and will be unavailable in the future. Consider using \code{iseed} instead.} } - \item{iseed}{an integer to be passed to \code{clusterSetRNGStream} when \code{init.rand=TRUE}.} - + Use \code{iseed} argument to achieve reproducible results. \strong{This argument is duplicated and will be unavailable in the future.}} +% \item{seed}{integer vector of random seeds. It should have the same +% length as \code{cl}. \strong{This argument is duplicated and will be unavailable in the future. Consider using \code{iseed} instead.} } + \item{iseed}{An integer. If non-\code{NULL} value is supplied random number generators are initialized. + It is passed to \code{set.seed} or \code{clusterSetRNGStream}.} + \item{quiet}{logical. If \code{TRUE} it does not report the progress.} } \details{ Function \code{pvclust} conducts multiscale bootstrap resampling to calculate @@ -144,6 +157,10 @@ parPvclust(cl=NULL, data, method.hclust="average", \code{\link{text.pvclust}}, \code{\link{pvrect}} and \code{\link{pvpick}}.} \references{ + Suzuki, R. and Shimodaira, H. (2006) + "Pvclust: an R package for assessing the uncertainty in hierarchical clustering", + \emph{Bioinformatics}, 22 (12): 1540-1542. + Shimodaira, H. (2004) "Approximately unbiased tests of regions using multistep-multiscale bootstrap resampling", @@ -159,15 +176,14 @@ parPvclust(cl=NULL, data, method.hclust="average", \emph{The Fifteenth International Conference on Genome Informatics 2004}, P034. - \url{http://www.is.titech.ac.jp/~shimo/prog/pvclust/} + \url{http://www.sigmath.es.osaka-u.ac.jp/shimo-lab/prog/pvclust/} } \examples{ -## using Boston data in package MASS -library(MASS) -data(Boston) +### example using Boston data in package MASS +data(Boston, package = "MASS") -## multiscale bootstrap resampling -boston.pv <- pvclust(Boston, nboot=100) +## multiscale bootstrap resampling (non-parallel) +boston.pv <- pvclust(Boston, nboot=100, parallel=FALSE) ## CAUTION: nboot=100 may be too small for actual use. ## We suggest nboot=1000 or larger. @@ -190,19 +206,32 @@ msplot(boston.pv, edges=c(2,4,6,7)) par(ask=ask.bak) -## Print clusters with high p-values +## print clusters with high p-values boston.pp <- pvpick(boston.pv) boston.pp -\dontrun{ -## parallel computation -library(parallel) -cl <- makeCluster(2, type = "PSOCK") +### Using a custom distance measure + +## Define a distance function which returns an object of class "dist". +## The function must have only one argument "x" (data matrix or data.frame). +cosine <- function(x) { + x <- as.matrix(x) + y <- t(x) \%*\% x + res <- 1 - y / (sqrt(diag(y)) \%*\% t(sqrt(diag(y)))) + res <- as.dist(res) + attr(res, "method") <- "cosine" + return(res) +} + +result <- pvclust(Boston, method.dist=cosine, nboot=100) +plot(result) -## parallel version of pvclust -boston.pv <- parPvclust(cl, Boston, nboot=1000) -stopCluster(cl) +\dontrun{ +### parallel computation +result.par <- pvclust(Boston, nboot=1000, parallel=TRUE) +plot(result.par) } + } \author{Ryota Suzuki \email{[email protected]}} \keyword{cluster} -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/pvclust.git _______________________________________________ debian-med-commit mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit
