Using the caret and xgboost packages and this ( https://www.kaggle.com/code/pelkoja/visual-xgboost-tuning-with-caret) tutorial, I am running an XGBoost regression (XGBR) and I want to extract the residuals of the XGBR. I hyper-tuned the model using the caret package and then, using the 'best' model parameters I used the xgboost package to perform the regression.
My dataset has the ntl, pop, tirs, agbh variables stored in data.frame (ntl is the dependent variable while the other three are the independent). Assuming that my XGBR model is called m and my data.frame is called block.data, I did: library(caret) library(terra) library(xgboost) library(doParallel) library(dplyr) library(ggplot2) library(glue) library(ModelMetrics) library(readr) wd = "path/" block.data = read.csv(paste0(wd, "block.data.csv")) block.data = subset(block.data, select = c(ntl, tirs, pop, agbh)) set.seed(1123) samp <- sample(nrow(block.data), 0.80 * nrow(block.data)) train <- block.data[samp, ] train_x <- as.matrix(select(train, -ntl)) train_y <- train$ntl test <- block.data[-samp, ] test_x <- select(test, -ntl) test_y <- test$ntl no_cores <- detectCores() - 1 cl = makePSOCKcluster(no_cores) registerDoParallel(cl) # default model grid_default <- expand.grid( nrounds = 100, max_depth = 6, eta = 0.3, gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 ) train_control <- caret::trainControl( method = "none", verboseIter = FALSE, # no training log allowParallel = TRUE # FALSE for reproducible results ) xgb_base <- caret::train( x = train_x, y = train_y, trControl = train_control, tuneGrid = grid_default, method = "xgbTree", verbose = TRUE ) # hyperparameter tuning # setting up the maximum number of trees nrounds <- 1000 # note to start nrounds from 200, as smaller learning rates result in errors so # big with lower starting points that they'll mess the scales tune_grid <- expand.grid( nrounds = seq(from = 200, to = nrounds, by = 50), eta = c(0.025, 0.05, 0.1, 0.3), max_depth = c(2, 3, 4, 5, 6), gamma = 0, colsample_bytree = 1, min_child_weight = 1, subsample = 1 ) tune_control <- caret::trainControl( method = "cv", # cross-validation number = 3, # with n folds #index = createFolds(tr_treated$Id_clean), # fix the folds verboseIter = FALSE, # no training log allowParallel = TRUE # FALSE for reproducible results ) xgb_tune <- caret::train( x = train_x, y = train_y, trControl = tune_control, tuneGrid = tune_grid, method = "xgbTree", verbose = TRUE ) tuneplot <- function(x, probs = .90) { ggplot(x) + coord_cartesian(ylim = c(quantile(x$results$RMSE, probs = probs), min(x$results$RMSE))) + theme_bw() } tuneplot(xgb_tune) xgb_tune$bestTune # find maximum depth tune_grid2 <- expand.grid( nrounds = seq(from = 50, to = nrounds, by = 50), eta = xgb_tune$bestTune$eta, max_depth = ifelse(xgb_tune$bestTune$max_depth == 2, c(xgb_tune$bestTune$max_depth:4), xgb_tune$bestTune$max_depth - 1:xgb_tune$bestTune$max_depth + 1), gamma = 0, colsample_bytree = 1, min_child_weight = c(1, 2, 3), subsample = 1 ) xgb_tune2 <- caret::train( x = train_x, y = train_y, trControl = tune_control, tuneGrid = tune_grid2, method = "xgbTree", verbose = TRUE ) tuneplot(xgb_tune2) xgb_tune2$bestTune # different values for row and column sampling tune_grid3 <- expand.grid( nrounds = seq(from = 50, to = nrounds, by = 50), eta = xgb_tune$bestTune$eta, max_depth = xgb_tune2$bestTune$max_depth, gamma = 0, colsample_bytree = c(0.4, 0.6, 0.8, 1.0), min_child_weight = xgb_tune2$bestTune$min_child_weight, subsample = c(0.5, 0.75, 1.0) ) xgb_tune3 <- caret::train( x = train_x, y = train_y, trControl = tune_control, tuneGrid = tune_grid3, method = "xgbTree", verbose = TRUE ) tuneplot(xgb_tune3, probs = .95) xgb_tune3$bestTune set.seed(57) omp_set_num_threads(2) # caret parallel processing threads # gamma tune_grid4 <- expand.grid( nrounds = seq(from = 50, to = nrounds, by = 50), eta = xgb_tune$bestTune$eta, max_depth = xgb_tune2$bestTune$max_depth, gamma = c(0, 0.05, 0.1, 0.5, 0.7, 0.9, 1.0), colsample_bytree = xgb_tune3$bestTune$colsample_bytree, min_child_weight = xgb_tune2$bestTune$min_child_weight, subsample = xgb_tune3$bestTune$subsample ) xgb_tune4 <- caret::train( x = train_x, y = train_y, trControl = tune_control, tuneGrid = tune_grid4, method = "xgbTree", verbose = TRUE ) tuneplot(xgb_tune4) xgb_tune4$bestTune # Reducing the Learning Rate tune_grid5 <- expand.grid( nrounds = seq(from = 100, to = 10000, by = 100), eta = c(0.01, 0.015, 0.025, 0.05, 0.1), max_depth = xgb_tune2$bestTune$max_depth, gamma = xgb_tune4$bestTune$gamma, colsample_bytree = xgb_tune3$bestTune$colsample_bytree, min_child_weight = xgb_tune2$bestTune$min_child_weight, subsample = xgb_tune3$bestTune$subsample ) xgb_tune5 <- caret::train( x = train_x, y = train_y, trControl = tune_control, tuneGrid = tune_grid5, method = "xgbTree", verbose = TRUE ) tuneplot(xgb_tune5) xgb_tune5$bestTune # Fitting the Model (final_grid <- expand.grid( nrounds = xgb_tune5$bestTune$nrounds, eta = xgb_tune5$bestTune$eta, max_depth = xgb_tune5$bestTune$max_depth, gamma = xgb_tune5$bestTune$gamma, colsample_bytree = xgb_tune5$bestTune$colsample_bytree, min_child_weight = xgb_tune5$bestTune$min_child_weight, subsample = xgb_tune5$bestTune$subsample )) (xgb_model <- caret::train( x = train_x, y = train_y, trControl = train_control, tuneGrid = final_grid, method = "xgbTree", verbose = TRUE )) stopCluster(cl) # apply model to the whole data set using xgboost xgb_m <- xgb.DMatrix(data = data.matrix(block.data), label = block.data$ntl) m = xgb.train(data = xgb_m, max.depth = xgb_tune5$bestTune$max_depth, # watchlist = watchlist, nrounds = xgb_tune5$bestTune$nrounds, min_child_weight = xgb_tune5$bestTune$min_child_weight, subsample = xgb_tune5$bestTune$subsample, eta = xgb_tune5$bestTune$eta, gamma = xgb_tune5$bestTune$gamma, colsample_bytree = xgb_tune5$bestTune$colsample_bytree, objective = "reg:squarederror") m # export xgb residuals xgb_resids = predict(m, xgb_m, na.rm = TRUE) sb = c(ntl, pop_res, tirs_res, agbh_res) xgb_resids = sb$ntl - xgb_resids plot(xgb_resids) # the plot is raster stack with many layers Obviously, I am doing something very wrong. How can I export the residuals of an XGBR as a single raster? Here is a very small sample of my dataset: block.data = structure(list(x = c(11880750L, 11879250L, 11879750L, 11880250L, 11880750L, 11881250L, 11879250L, 11879750L, 11880250L, 11880750L, 11881250L, 11878750L, 11879250L, 11879750L, 11880250L, 11880750L, 11881250L, 11879250L, 11879750L, 11880250L, 11880750L, 11881250L, 11881750L, 11882250L, 11879250L, 11879750L, 11880250L, 11880750L, 11881250L, 11881750L, 11882250L, 11882750L, 11879250L, 11879750L ), y = c(1802250L, 1801750L, 1801750L, 1801750L, 1801750L, 1801750L, 1801250L, 1801250L, 1801250L, 1801250L, 1801250L, 1800750L, 1800750L, 1800750L, 1800750L, 1800750L, 1800750L, 1800250L, 1800250L, 1800250L, 1800250L, 1800250L, 1800250L, 1800250L, 1799750L, 1799750L, 1799750L, 1799750L, 1799750L, 1799750L, 1799750L, 1799750L, 1799250L, 1799250L ), ntl = c(18.7969169616699, 25.7222957611084, 23.4188251495361, 25.4322757720947, 16.4593601226807, 12.7868213653564, 30.9337253570557, 29.865758895874, 30.4080600738525, 29.5479888916016, 24.3493347167969, 35.2427635192871, 38.989933013916, 34.6536979675293, 29.4607238769531, 30.7469024658203, 34.3946380615234, 42.8660278320312, 34.7930717468262, 30.9516315460205, 32.20654296875, 39.999755859375, 46.6002235412598, 38.6480979919434, 60.5214920043945, 33.1799964904785, 31.8498134613037, 30.9209423065186, 32.2269744873047, 53.7062034606934, 45.5225944519043, 38.3570976257324, 123.040382385254, 73.0528182983398), pop = c(19.6407718658447, 610.009216308594, 654.812622070312, 426.475830078125, 66.3839492797852, 10.6471328735352, 443.848846435547, 602.677429199219, 488.478454589844, 387.470947265625, 58.2341117858887, 413.888488769531, 315.057678222656, 354.082946777344, 602.827758789062, 463.518829345703, 296.713928222656, 923.920593261719, 434.436645507812, 799.562927246094, 404.709564208984, 265.043304443359, 366.697235107422, 399.851684570312, 952.2314453125, 870.356994628906, 673.406616210938, 493.521606445312, 273.841888427734, 371.428619384766, 383.057830810547, 320.986755371094, 991.131225585938, 1148.87768554688), tirs = c(39.7242431640625, 44.9583969116211, 41.4048385620117, 42.6056709289551, 40.0976028442383, 38.7490005493164, 44.2747650146484, 43.5645370483398, 41.6180191040039, 40.3799781799316, 38.8664817810059, 44.9089202880859, 44.414306640625, 44.560977935791, 43.1288986206055, 40.9315185546875, 38.8918418884277, 46.3063850402832, 45.5805702209473, 44.9196586608887, 42.2495613098145, 39.3051452636719, 38.7914810180664, 38.6069412231445, 44.6782455444336, 46.4024772644043, 44.4720573425293, 41.7361183166504, 42.3378067016602, 41.0018348693848, 39.3579216003418, 41.6303863525391, 43.8207550048828, 46.0460357666016 ), agbh = c(3.32185006141663, 4.98925733566284, 4.35699367523193, 4.94798421859741, 3.14325952529907, 2.93211793899536, 4.52736520767212, 4.99723243713379, 5.13944292068481, 3.92965626716614, 3.43465113639832, 3.55617475509644, 3.4659411907196, 5.24469566345215, 5.36995029449463, 4.61549234390259, 4.82002925872803, 4.20452928543091, 4.71502685546875, 5.20452785491943, 5.05676746368408, 5.9952244758606, 6.16778612136841, 4.69053316116333, 2.62325501441956, 4.74775457382202, 4.93133020401001, 5.02366256713867, 5.74016952514648, 6.28353786468506, 4.67424774169922, 4.56812858581543, 1.88153350353241, 4.31531000137329)), class = "data.frame", row.names = c(NA, -34L)) My raster layer: r = new("RasterBrick", file = new(".RasterFile", name = "", datanotation = "FLT4S", byteorder = "little", nodatavalue = -Inf, NAchanged = FALSE, nbands = 1L, bandorder = "BIL", offset = 0L, toptobottom = TRUE, blockrows = 0L, blockcols = 0L, driver = "", open = FALSE), data = new(".MultipleRasterData", values = structure(c(NA, NA, NA, NA, 18.7969169616699, NA, NA, NA, NA, NA, 25.7222957611084, 23.4188251495361, 25.4322757720947, 16.4593601226807, 12.7868213653564, NA, NA, NA, NA, 30.9337253570557, 29.865758895874, 30.4080600738525, 29.5479888916016, 24.3493347167969, NA, NA, NA, 35.2427635192871, 38.989933013916, 34.6536979675293, 29.4607238769531, 30.7469024658203, 34.3946380615234, NA, NA, NA, NA, 42.8660278320312, 34.7930717468262, 30.9516315460205, 32.20654296875, 39.999755859375, 46.6002235412598, 38.6480979919434, NA, NA, 60.5214920043945, 33.1799964904785, 31.8498134613037, 30.9209423065186, 32.2269744873047, 53.7062034606934, 45.5225944519043, 38.3570976257324, NA, 123.040382385254, 73.0528182983398, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 19.6407718658447, NA, NA, NA, NA, NA, 610.009216308594, 654.812622070312, 426.475830078125, 66.3839492797852, 10.6471328735352, NA, NA, NA, NA, 443.848846435547, 602.677429199219, 488.478454589844, 387.470947265625, 58.2341117858887, NA, NA, NA, 413.888488769531, 315.057678222656, 354.082946777344, 602.827758789062, 463.518829345703, 296.713928222656, NA, NA, NA, NA, 923.920593261719, 434.436645507812, 799.562927246094, 404.709564208984, 265.043304443359, 366.697235107422, 399.851684570312, NA, NA, 952.2314453125, 870.356994628906, 673.406616210938, 493.521606445312, 273.841888427734, 371.428619384766, 383.057830810547, 320.986755371094, NA, 991.131225585938, 1148.87768554688, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 39.7242431640625, NA, NA, NA, NA, NA, 44.9583969116211, 41.4048385620117, 42.6056709289551, 40.0976028442383, 38.7490005493164, NA, NA, NA, NA, 44.2747650146484, 43.5645370483398, 41.6180191040039, 40.3799781799316, 38.8664817810059, NA, NA, NA, 44.9089202880859, 44.414306640625, 44.560977935791, 43.1288986206055, 40.9315185546875, 38.8918418884277, NA, NA, NA, NA, 46.3063850402832, 45.5805702209473, 44.9196586608887, 42.2495613098145, 39.3051452636719, 38.7914810180664, 38.6069412231445, NA, NA, 44.6782455444336, 46.4024772644043, 44.4720573425293, 41.7361183166504, 42.3378067016602, 41.0018348693848, 39.3579216003418, 41.6303863525391, NA, 43.8207550048828, 46.0460357666016, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3.32185006141663, NA, NA, NA, NA, NA, 4.98925733566284, 4.35699367523193, 4.94798421859741, 3.14325952529907, 2.93211793899536, NA, NA, NA, NA, 4.52736520767212, 4.99723243713379, 5.13944292068481, 3.92965626716614, 3.43465113639832, NA, NA, NA, 3.55617475509644, 3.4659411907196, 5.24469566345215, 5.36995029449463, 4.61549234390259, 4.82002925872803, NA, NA, NA, NA, 4.20452928543091, 4.71502685546875, 5.20452785491943, 5.05676746368408, 5.9952244758606, 6.16778612136841, 4.69053316116333, NA, NA, 2.62325501441956, 4.74775457382202, 4.93133020401001, 5.02366256713867, 5.74016952514648, 6.28353786468506, 4.67424774169922, 4.56812858581543, NA, 1.88153350353241, 4.31531000137329, NA, NA, NA, NA, NA, NA), .Dim = c(63L, 4L)), offset = 0, gain = 1, inmemory = TRUE, fromdisk = FALSE, nlayers = 4L, dropped = NULL, isfactor = c(FALSE, FALSE, FALSE, FALSE), attributes = list(), haveminmax = TRUE, min = c(12.7868213653564, 10.6471328735352, 38.6069412231445, 1.88153350353241), max = c(123.040382385254, 1148.87768554688, 46.4024772644043, 6.28353786468506), unit = "", names = c("ntl", "pop", "tirs", "agbh")), legend = new(".RasterLegend", type = character(0), values = logical(0), color = logical(0), names = logical(0), colortable = logical(0)), title = character(0), extent = new("Extent", xmin = 11878500, xmax = 11883000, ymin = 1799000, ymax = 1802500), rotated = FALSE, rotation = new(".Rotation", geotrans = numeric(0), transfun = function () NULL), ncols = 9L, nrows = 7L, crs = new("CRS", projargs = NA_character_), srs = character(0), history = list(), z = list()) -- Tziokas Nikolaos Cartographer Tel:(+44)07561120302 LinkedIn <http://linkedin.com/in/nikolaos-tziokas-896081130> [[alternative HTML version deleted]] _______________________________________________ R-sig-Geo mailing list R-sig-Geo@r-project.org https://stat.ethz.ch/mailman/listinfo/r-sig-geo