Hi All,

I tried to use xgboost to model and predict count data. The predictions, 
however, are not as expected, as shown below.
# sponge count data in library(spm)
    library(spm)
data(sponge)
data(sponge.grid)
names(sponge)
[1] "easting"  "northing" "sponge"   "tpi3"     "var7"     "entro7"   "bs34"    
 "bs11"
names(sponge.grid)
[1] "easting"  "northing" "tpi3"     "var7"     "entro7"   "bs34"     "bs11"
    range(sponge[, c(3)])
[1]  1 39 # count sample data

# the expected predictions are:
set.seed(1234)
gbmpred1 <- gbmpred(sponge[, -c(3)], sponge[, 3], sponge.grid[, c(1:2)], 
sponge.grid, family = "poisson", n.cores=2)
range(gbmpred1$Predictions)
[1] 10.04643 31.39230 # the expected predictions

# Here are results from xgboost
# use count:poisson
library(xgboost)
    xgbst2.1 <- xgboost(data = as.matrix(sponge[, -c(3)]), label = sponge[, 3], 
max_depth = 2, eta = 0.001, nthread = 6, nrounds = 3000, objective = 
"count:poisson")
    xgbstpred2 <- predict(xgbst2.1, as.matrix(sponge.grid))
head(xgbstpred2)
range(xgbstpred2)
[1] 1.109032 4.083049 # much lower than expected
    table(xgbstpred2)
                1.10903215408325 1.26556181907654   3.578040599823 
4.08304929733276  # only four predictions, why?
                36535             2714            40930            15351

   plot(gbmpred1$Predictions, xgbstpred2)

   # use reg:linear
    xgbst2.2 <- xgboost(data = as.matrix(sponge[, -c(3)]), label = sponge[, 3], 
max_depth = 2, eta = 0.001, nthread = 6, nrounds = 3000, objective = 
"reg:linear")
    xgbstpred2.2 <- predict(xgbst2.2, as.matrix(sponge.grid))
    head(xgbstpred2.2)
    table(xgbstpred2.2)
    range( xgbstpred2.2)
[1]  9.019174 23.060669 # this is much closer to, but still lower than, what 
was expected

   plot(gbmpred1$Predictions, xgbstpred2.2)

# use count:poisson and subsample = 0.5
set.seed(1234)
    param <- list(max_depth = 2, eta = 0.001, gamma = 0.001, subsample = 0.5, 
silent = 1, nthread = 6, objective = "count:poisson")
    xgbst2.4 <- xgboost(data = as.matrix(sponge[, -c(3)]), label = sponge[, 3], 
params = param, nrounds = 3000)
    xgbstpred2.4 <- predict(xgbst2.4, as.matrix(sponge.grid))
    head(xgbstpred2.4)
    table(xgbstpred2.4)
    range(xgbstpred2.4)
[1] 1.188561 3.986767 # this is much lower than expected

   plot(gbmpred1$Predictions, xgbstpred2.4)
  plot(xgbstpred2.2, xgbstpred2.4)

All these were run in R 3.3.3 on Windows:
> Sys.info()
                     sysname                      release
                   "Windows"                      "7 x64"
                     version
"build 7601, Service Pack 1"
                     machine
                    "x86-64"

Have I misspecified or missed some parameters? Or is there a bug in xgboost? 
I am grateful for any help.

Kind regards,
Jin

Jin Li, PhD | Spatial Modeller / Computational Statistician
National Earth and Marine Observations | Environmental Geoscience Division
t:  +61 2 6249 9899    www.ga.gov.au<http://www.ga.gov.au/>

Geoscience Australia Disclaimer: This e-mail (and files transmitted with it) is 
intended only for the person or entity to which it is addressed. If you are not 
the intended recipient, then you have received this e-mail by mistake and any 
use, dissemination, forwarding, printing or copying of this e-mail and its file 
attachments is prohibited. The security of emails transmitted cannot be 
guaranteed; by forwarding or replying to this email, you acknowledge and accept 
these risks.
-------------------------------------------------------------------------------------------------------------------------


        [[alternative HTML version deleted]]

_______________________________________________
R-sig-Geo mailing list
R-sig-Geo@r-project.org
https://stat.ethz.ch/mailman/listinfo/r-sig-geo

Reply via email to