On Thu, May 3, 2012 at 12:19 PM, Brian G. Peterson <br...@braverock.com> wrote: > On Thu, 2012-05-03 at 12:09 -0500, Paul Johnson wrote: >> Greetings: >> >> On Thu, May 3, 2012 at 11:36 AM, Brian G. Peterson <br...@braverock.com> >> wrote: >> > On Thu, 2012-05-03 at 10:51 -0500, Paul Johnson wrote: >> >> If somebody in R Core would like this and think about putting it, or >> >> something like it, into the base, then many chores involving predicted >> >> values would become much easier. >> >> >> > Why does this need to be in base? Implement it in a package. >> >>
So, nobody agrees with me that R base should have model.data? Too bad! You could save us a lot of effort. I've found different efforts to get the same work done in termplot and all of the implementations of predict. And just about every regression related package has its own approach. If there were one good way that would always work, just think how convenient it would be for all those function & package writers. Oh, well. I'm not saying that my model.data is perfect, just that I wish there were a perfect one :) Yesterday, I realized that predict.nls probably has to deal with this problem so I studied that and have yet another version of model.data to propose to you. I'm using this in my regression-support package rockchalk, so you don't need to give me Brian Peterson's advice to "put it in a package". The idea here is to take variables from a fitted model's data if it can find them, and then grab variables from the parent environment IF they have the correct length. This means we ignore variables like d in poly(x, d) because the variable d is not of the same length as the variables in the model. ##' Creates a "raw" (UNTRANSFORMED) data frame equivalent ##' to the input data that would be required to fit the given model. ##' ##' Unlike model.frame and model.matrix, this does not return transformed ##' variables. ##' ##' @param model A fitted regression model in which the data argument ##' is specified. This function will fail if the model was not fit ##' with the data option. ##' @return A data frame ##' @export ##' @author Paul E. Johnson <pauljohn@@ku.edu> ##' @example inst/examples/model.data-ex.R model.data <- function(model){ #from nls, returns -1 for missing variables lenVar <- function(var, data) tryCatch(length(eval(as.name(var), data, env)), error = function(e) -1) fmla <- formula(model) varNames <- all.vars(fmla) ## all variable names ## varNames includes d in poly(x,d), possibly other "constants" ## varNamesRHS <- all.vars(formula(delete.response(terms(model)))) ## previous same as nls way? fmla2 <- fmla fmla2[[2L]] <- 0 varNamesRHS <- all.vars(fmla2) varNamesLHS <- setdiff(varNames, varNamesRHS) env <- environment(fmla) if (is.null(env)) env <- parent.frame() dataOrig <- eval(model$call$data, environment(formula(model))) rndataOrig <- row.names(dataOrig) n <- sapply(varNames, lenVar, data=dataOrig) targetLength <- length(eval(as.name(varNamesLHS[1]), dataOrig, env)) varNames <- varNames[ n == targetLength ] ldata <- lapply(varNames, function(x) eval(as.name(x), dataOrig, env)) names(ldata) <- varNames data <- data.frame(ldata[varNames]) if (!is.null(rndataOrig)) row.names(data) <- rndataOrig ## remove rows listed in model's na.action ## TODO: question: what else besides OMIT might be called for? if ( !is.null(model$na.action)){ data <- data[ -as.vector(model$na.action), ] } ## keep varNamesRHS that exist in datOrig attr(data, "varNamesRHS") <- setdiff(colnames(data), varNamesLHS) invisible(data) } And some example output: > ## check if model.data works when there is no data argument > set.seed(12345) > x1 <- rpois(100, l=6) > x2 <- rnorm(100, m=50, s=10) > x3 <- rnorm(100) > y <- rnorm(100) > m0 <- lm(y ~ x1 + x2 + x3) > m0.data <- model.data(m0) > x1[4:5] <- NA > m0 <- lm(y ~ x1 + x2 + x3) > m0.data <- model.data(m0) > head(m0.data) y x1 x2 x3 1 -0.8086741 7 44.59614 -1.6193283 2 1.0011198 9 69.47693 0.5483979 3 0.4560525 8 50.53590 0.1952822 6 0.6417692 4 52.77954 -0.2509466 7 -0.4150210 5 56.91171 1.6993467 8 -0.4595757 6 58.23795 -0.3442988 > x1 <- rpois(100, l=6) > x2 <- rnorm(100, m=50, s=10) > x3 <- rnorm(100) > xcat1 <- gl(2,50, labels=c("M","F")) > xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf), labels=c("R", > "M", "D", "P", "G")) > dat <- data.frame(x1, x2, x3, xcat1, xcat2) > rm(x1, x2, x3, xcat1, xcat2) > xcat1n <- with(dat, contrasts(xcat1)[xcat1, ,drop=FALSE]) > xcat2n <- with(dat, contrasts(xcat2)[xcat2, ]) > STDE <- 20 > dat$y <- 0.03 + 0.8*dat$x1 + 0.1*dat$x2 + 0.7*dat$x3 + xcat1n %*% c(2) + > xcat2n %*% c(0.1,-2,0.3, 0.1) + STDE*rnorm(100) > rownames(dat$y) <- NULL > m1 <- lm(y ~ poly(x1, 2), data=dat) > m1.data <- model.data(m1) > head(m1.data) y x1 1 56.336279 7 2 47.823205 5 3 9.296108 6 4 16.213508 5 5 -16.922331 3 6 10.639724 7 > attr(m1.data, "varNamesRHS") [1] "x1" > d <- 2 > m2 <- lm(y ~ poly(x1, d), data=dat) > m2.data <- model.data(m2) > head(m2.data) y x1 1 56.336279 7 2 47.823205 5 3 9.296108 6 4 16.213508 5 5 -16.922331 3 6 10.639724 7 > attr(m2.data, "varNamesRHS") [1] "x1" > m3 <- lm(y ~ log(10 + x1) + poly(x1, d) + sin(x2), data=dat) > m3.data <- model.data(m3) > head(m3.data) y x1 x2 1 56.336279 7 56.27965 2 47.823205 5 50.02144 3 9.296108 6 52.84378 4 16.213508 5 39.98221 5 -16.922331 3 43.82778 6 10.639724 7 58.28194 > attr(m3.data, "varNamesRHS") [1] "x1" "x2" > m4 <- lm(log(50+y) ~ log(d+10+x1) + poly(x1, 2), data=dat) > m4.data <- model.data(m4) > head(m4.data) y x1 1 56.336279 7 2 47.823205 5 3 9.296108 6 4 16.213508 5 5 -16.922331 3 6 10.639724 7 > attr(m4.data, "varNamesRHS") [1] "x1" > m4 <- lm(y ~ x1*x1, data=dat) > m4.data <- model.data(m4) > head(m4.data) y x1 1 56.336279 7 2 47.823205 5 3 9.296108 6 4 16.213508 5 5 -16.922331 3 6 10.639724 7 > attr(m4.data, "varNamesRHS") [1] "x1" > m4 <- lm(y ~ x1 + I(x1^2), data=dat) > m4.data <- model.data(m4) > head(m4.data) y x1 1 56.336279 7 2 47.823205 5 3 9.296108 6 4 16.213508 5 5 -16.922331 3 6 10.639724 7 > attr(m4.data, "varNamesRHS") [1] "x1" > dat$x1[sample(100, 5)] <- NA > dat$y[sample(100, 5)] <- NA > dat$x2[sample(100, 5)] <- NA > dat$x3[sample(100,10)] <- NA > m1 <- lm(y ~ log(10 + x1), data=dat) > m1.data <- model.data(m1) > head(m1.data) y x1 1 56.336279 7 2 47.823205 5 3 9.296108 6 4 16.213508 5 5 -16.922331 3 6 10.639724 7 > summarize(m1.data) $numerics x1 y 0% 1.000 -31.9800 25% 5.000 0.9636 50% 6.000 13.8400 75% 7.000 27.5300 100% 12.000 71.1100 mean 5.933 14.8400 sd 2.336 20.4300 var 5.456 417.3000 NA's 0.000 0.0000 N 90.000 90.0000 $factors NULL > attr(m1.data, "varNamesRHS") [1] "x1" > m2 <- lm(y ~ log(x1 + 10), data=dat) > m1.data <- model.data(m1) > head(m1.data) y x1 1 56.336279 7 2 47.823205 5 3 9.296108 6 4 16.213508 5 5 -16.922331 3 6 10.639724 7 > summarize(m1.data) $numerics x1 y 0% 1.000 -31.9800 25% 5.000 0.9636 50% 6.000 13.8400 75% 7.000 27.5300 100% 12.000 71.1100 mean 5.933 14.8400 sd 2.336 20.4300 var 5.456 417.3000 NA's 0.000 0.0000 N 90.000 90.0000 $factors NULL > attr(m1.data, "varNamesRHS") [1] "x1" > d <- 2 > m3 <- lm(log(50+y) ~ log(d+10+x1) + x2 + sin(x3), data=dat) > m3.data <- model.data(m3) > head(m3.data) y x1 x2 x3 2 47.823205 5 50.02144 -2.4669386 3 9.296108 6 52.84378 0.4847158 4 16.213508 5 39.98221 -0.9379723 5 -16.922331 3 43.82778 3.3307333 7 26.084587 3 49.15181 0.2204558 8 4.392061 8 45.65280 0.8762108 > summarize(m3.data) $numerics x1 x2 x3 y 0% 1.000 27.630 -2.55600 -31.9800 25% 4.000 42.370 -0.52690 0.7905 50% 6.000 49.150 0.04162 11.5800 75% 7.000 54.390 0.71310 26.8900 100% 12.000 72.390 3.33100 71.1100 mean 5.883 48.800 0.02186 13.2300 sd 2.362 8.847 1.08900 20.0400 var 5.578 78.270 1.18700 401.8000 NA's 0.000 0.000 0.00000 0.0000 N 77.000 77.000 77.00000 77.0000 $factors NULL > attr(m3.data, "varNamesRHS") [1] "x1" "x2" "x3" > m4 <- lm(y ~ x1^3 + log(x2), data=dat) > m4.data <- model.data(m4) > summarize(m4.data) $numerics x1 x2 y 0% 1.000 27.630 -31.9800 25% 4.000 42.630 0.8032 50% 6.000 49.710 12.2500 75% 7.000 54.910 27.2300 100% 12.000 72.390 71.1100 mean 5.918 49.220 14.0800 sd 2.372 8.808 20.0000 var 5.624 77.590 399.9000 NA's 0.000 0.000 0.0000 N 85.000 85.000 85.0000 $factors NULL > attr(m4.data, "varNamesRHS") [1] "x1" "x2" > m5 <- lm(y ~ x1 + I(x1^2) + cos(x2), data=dat) > m5.data <- model.data(m5) > head(m5.data) y x1 x2 1 56.336279 7 56.27965 2 47.823205 5 50.02144 3 9.296108 6 52.84378 4 16.213508 5 39.98221 5 -16.922331 3 43.82778 7 26.084587 3 49.15181 > summarize(m5.data) $numerics x1 x2 y 0% 1.000 27.630 -31.9800 25% 4.000 42.630 0.8032 50% 6.000 49.710 12.2500 75% 7.000 54.910 27.2300 100% 12.000 72.390 71.1100 mean 5.918 49.220 14.0800 sd 2.372 8.808 20.0000 var 5.624 77.590 399.9000 NA's 0.000 0.000 0.0000 N 85.000 85.000 85.0000 $factors NULL > attr(m5.data, "varNamesRHS") [1] "x1" "x2" > x10 <- rnorm(100) > x11 <- rnorm(100) > m6 <- lm(y ~ x1 + I(x1^2) + cos(x2) + log(10 + x10) + sin(x11) + x10*x11, > data=dat) > m6.data <- model.data(m6) > head(m6.data) y x1 x2 x10 x11 1 56.336279 7 56.27965 -0.4562360 -0.27005578 2 47.823205 5 50.02144 -1.4031737 -1.11355194 3 9.296108 6 52.84378 -0.2816067 -0.08319405 4 16.213508 5 39.98221 0.3353308 0.17284191 5 -16.922331 3 43.82778 -0.5184262 0.61754378 7 26.084587 3 49.15181 0.9037821 0.02170007 > dim(m6.data) [1] 85 5 > summarize(m5.data) $numerics x1 x2 y 0% 1.000 27.630 -31.9800 25% 4.000 42.630 0.8032 50% 6.000 49.710 12.2500 75% 7.000 54.910 27.2300 100% 12.000 72.390 71.1100 mean 5.918 49.220 14.0800 sd 2.372 8.808 20.0000 var 5.624 77.590 399.9000 NA's 0.000 0.000 0.0000 N 85.000 85.000 85.0000 $factors NULL > attr(m6.data, "varNamesRHS") [1] "x1" "x2" "x10" "x11" > -- Paul E. Johnson Professor, Political Science Assoc. Director 1541 Lilac Lane, Room 504 Center for Research Methods University of Kansas University of Kansas http://pj.freefaculty.org http://quant.ku.edu ______________________________________________ R-devel@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-devel