Dear friends, I hope you are all well. I am working on decision trees and wrote the following program to construct classification trees. It works, but it sometimes stops with an error message such as "negative extents to matrix". I have tried to resolve the issue but could not. Could someone help me understand how to avoid this error? Thanks.
#################################### ds=iris; dl=nrow(ds) # ds: dataset, dl: data length c1=ds[,1]; c2=ds[,2]; c3=ds[,3]; c4=ds[,4]; c5=ds[,5]; id=c(1:dl); iris=cbind(c1,c2,c3,c4,c5) tec1NF = 0; tec2NF = 0; lmsum = 0; indTrainlist = list(); indTestlist = list() options(digits=6) CTPrundUnprund=function(dataset,samplesize,noofsamples,noofpredictors,response,method,nc,nlvls,nnodes) { vlf1=list(); vlf2=array(0,dim=c(noofsamples,nlvls,nnodes)); cl=list(); uncl=array(0,dim=c(noofsamples,nlvls,1)); vvlf1=list(); nodeclas=c(); tdi=c(); eTraining=c(); nodeside=c(); unclp=array(0,dim=c(nlvls,1)); vvlf2=array(0,dim=c(noofsamples,nlvls,nnodes)); nodeclasp=c(); nodesidep=c() for (j in 1:noofsamples) { groupindex = 1:nrow(dataset) index = sample(groupindex,samplesize, replace=TRUE) indTrain=unique(sort(index)); indTest=groupindex[-indTrain]; m=dataset[indTrain,]; lm <- nrow(m); lmsum <<- lmsum + lm; #w=dataset[indTest,]; tdi[j]=list(indTrain); indTrainlist[j] = list(indTrain); indTestlist[j] = list(indTest) mcat=table(m[,response]) mcat=data.frame(mcat) # or mcat=list(mcat) mcat=mcat[,1] mcat=matrix(mcat) mcat=as.real(mcat) print(j) d1=0 sv=function(dataset) { gnew=c(); ggini=c(); gtwoing=c(); gentropy=c(); val=c(); mGnewLr=c(); mGginiLr=c(); mGtwoingLr=c(); mGentropyLr=c(); sPjnew=c(); spLnew=c(); spRnew=c(); matrGnew=c(); matrGgini=c(); matrGtwoing=c(); matrGentropy=c(); sPjgini=c(); spLgini=c(); spRgini=c(); sAbsDiff=c(); sPj2=c(); spL2=c(); spR2=c(); sPj=c(); spL=c(); spR=c(); logPj=c(); logpR=c(); logpL=c(); variables=c() m=dataset mcat=table(m[,response]) mcat=data.frame(mcat) # or mcat=list(mcat) mcat=mcat[,1] mcat=matrix(mcat) mcat=as.real(mcat) Pj=table(m[,response])/nrow(m) Pj # Left Node for (variable in 1:noofpredictors) { gnew=NULL; ggini=NULL; gtwoing=NULL; gentropy=NULL; val=NULL varSort=sort(unique(dataset[,variable])) #varSort1=sort(unique(m[,variable])); minval <- min(varSort1); maxval <- max(varSort1); r <- maxval-minval; K <- 
round(1+(3.3*log(nrow(m),10)),digits=0) #; H <- r/K; varSort <- seq(minval, maxval,H) L=length(varSort) for (k in 1 : (L-1)) { val[k]=varSort[k] x=m[m[,variable]<=val[k]] # Obtained set of data with all columns which satisfies the condition lenx=length(x) # Length of x lenx=lenx/ncol(m) # xl/no of columns x_left=matrix(x,lenx) # matrix(x,xr) p1=lenx/nrow(m) y=x_left[,response] y1=table(y) feqy1=matrix(y1) y1=data.frame(y1) y1=y1[,1] y1=matrix(y1) y1=as.real(y1) z=match(mcat,y1) y1=y1[z] y1[is.na(y1)]=0 feqy1=feqy1[z] feqy1[is.na(feqy1)]=0 cbind(y1, feqy1) pL=feqy1/length(x_left[,response]) pL # Right Node X=m[m[,variable]>val[k]] # Obtained set of data with all columns which satisfies the condition lenX=length(X) # Length of x lenX=lenX/ncol(m) # xl/no of columns x_right=matrix(X,lenX) # matrix(x,xr) p2=lenX/nrow(m) Y=x_right[,response] Y1=table(Y) feqy2=matrix(Y1) Y1=data.frame(Y1) Y1=Y1[,1] Y1=matrix(Y1) Y1=as.real(Y1) Z=match(mcat,Y1) Y1=Y1[Z] Y1[is.na(Y1)]=0 feqy2=feqy2[Z] feqy2[is.na(feqy2)]=0 cbind(Y1, feqy2) pR=feqy2/length(x_right[,response]) pR for ( i in 1: length(Pj)) { sPjnew[i]=Pj[i]*exp(Pj[i]); spRnew[i]=pR[i]*exp(pR[i]); spLnew[i]=pL[i]*exp(pL[i]) # New method sPjgini[i]=(Pj[i]^2); spRgini[i]=(pR[i]^2); spLgini[i]=(pL[i]^2) # Gini method sAbsDiff[i]=abs(pL[i]-pR[i]) # Twoing method if (Pj[i]!=0) # Entropy method { logPj[i]=log(Pj[i]) } else { logPj[i]= 0 } if (pR[i]!=0) { logpR[i]=log(pR[i]) } else { logpR[i]= 0 } if (pL[i]!=0) { logpL[i]=log(pL[i]) } else { logpL[i]= 0 } sPj2[i]=(Pj[i]^2); spR2[i]=(pR[i]^2); spL2[i]=(pL[i]^2); sPj[i]=-(Pj[i]*logPj[i]); spL[i]=-(pL[i]*logpL[i]); spR[i]=-(pR[i]*logpR[i]) } G1=exp(-1)*(p1*sum(spLnew)+p2*sum(spRnew)-sum(sPjnew)); G2=p1*sum(spLgini)+p2*sum(spRgini)-sum(sPjgini) G3=0.25*p1*p2*(sum(sAbsDiff))^2; G4=sum(sPj)-(p1*sum(spL))-(p2*sum(spR)) gnew[k]=G1; ggini[k]=G2; gtwoing[k]=G3; gentropy[k]=G4 } gnew; ggini; gtwoing; gentropy; val ind1=sort.list(gnew); ind2=sort.list(ggini); ind3=sort.list(gtwoing); 
ind4=sort.list(gentropy) gnew=gnew[ind1]; ggini=ggini[ind2]; gtwoing=gtwoing[ind3]; gentropy=gentropy[ind4] valGnew=val[ind1]; valGgini=val[ind2]; valGtwoing=val[ind3]; valGentropy=val[ind4] cmGnew=cbind(valGnew,gnew); cmGgini=cbind(valGgini,ggini); cmGtwoing=cbind(valGtwoing,gtwoing); cmGentropy=cbind(valGentropy,gentropy) mGnew=matrix(cmGnew,length(gnew)); mGgini=matrix(cmGgini,length(ggini)); mGtwoing=matrix(cmGtwoing,length(gtwoing)); mGentropy=matrix(cmGentropy,length(gentropy)) mGnewLr[variable]=list(mGnew[length(gnew),]); mGginiLr[variable]=list(mGgini[length(ggini),]); mGtwoingLr[variable]=list(mGtwoing[length(gtwoing),]); mGentropyLr[variable]=list(mGentropy[length(gentropy),]) } cbnew=do.call('cbind',mGnewLr); cbgini=do.call('cbind',mGginiLr); cbtwoing=do.call('cbind',mGtwoingLr); cbentropy=do.call('cbind',mGentropyLr) for(i in 1:noofpredictors){variables[i]=i} cb11=cbnew[1,]; cb12=cbnew[2,]; cb21=cbgini[1,]; cb22=cbgini[2,]; cb31=cbtwoing[1,]; cb32=cbtwoing[2,]; cb41=cbentropy[1,]; cb42=cbentropy[2,] indexGnew=sort.list(cb12); indexGgini=sort.list(cb22); indexGtwoing=sort.list(cb32); indexGentropy=sort.list(cb42) cb11=cb11[indexGnew]; cb12=cb12[indexGnew] cb21=cb21[indexGgini]; cb22=cb22[indexGgini] cb31=cb31[indexGtwoing]; cb32=cb32[indexGtwoing] cb41=cb41[indexGentropy]; cb42=cb42[indexGentropy] varGnew=variables[indexGnew]; varGgini=variables[indexGgini]; varGtwoing=variables[indexGtwoing]; varGentropy=variables[indexGentropy] cbGnew=cbind(varGnew,cb11,cb12) cbGgini=cbind(varGgini,cb21,cb22) cbGtwoing=cbind(varGtwoing,cb31,cb32) cbGentropy=cbind(varGentropy,cb41,cb42) mGnew=matrix(cbGnew,length(cb12)) mGgini=matrix(cbGgini,length(cb22)) mGtwoing=matrix(cbGtwoing,length(cb32)) mGentropy=matrix(cbGentropy,length(cb42)) matrGnew=mGnew[length(cb12),] matrGgini=mGgini[length(cb22),] matrGtwoing=mGtwoing[length(cb32),] matrGentropy=mGentropy[length(cb42),] variableimportance=list(matrGnew,matrGgini,matrGtwoing,matrGentropy) variableimportance } #sv(m) # 
Passing parameters LN=function(dataset,variable,value) { x_left=c() m=dataset x=m[m[,variable]<=value] # Obtained set of data with all columns which satisfies the condition lenx=length(x) # Length of x lenx=lenx/ncol(m) # xl/no of columns x_left=matrix(x,lenx) # matrix(x,xr) return(x_left) } RN=function(dataset,variable,value) { x_right=c() m=dataset X=m[m[,variable]>value] # Obtained set of data with all columns which satisfies the condition lengX=length(X) # Length of x lengX=lengX/ncol(m) # xl/no of columns x_right=matrix(X,lengX) # Function "SpVal" ends return(x_right) } e=0; n <- 0; vr<-c(); vv<-c(); vl=NULL; vl=array(0,dim=c(nc,nlvls,nnodes)); vvl=NULL; vvl=array(0,dim=c(nc,nlvls,nnodes)); vlf=NULL; vlf=list(); vvlf=NULL; vvlf=list(); trec1N = 0 func=function(data,level) { var1=c() f=table(data[,response]); wf=as.real(names(which.max(f))); trecN <- nrow(data)-max(table(data[,response])); for(q in 1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0; if (any(var1==0) || nrow(data)<=5) { n <<- n + 1 if (level > 0){ for (a in 1 : level) { vl[wf,a,n] <<- vr[a]; vvl[wf,a,n] <<- vv[a] # vl=variable level, vvl=variable value level }} trec1N <<- trec1N + trecN return() }else { data1 = data; s=sv(data1)[[method]]; s1 <- s[1]; s2=s[2]; level <- level + 1 vr[level] <<- s1; vv[level] <<- s2 # vv=variable value data=LN(data1,s1,s2) func(data,level) data=RN(data1,s1,s2) func(data,level) } } func(m,0) # Passing parameters eTraining[j] <- trec1N cl[[j]]=apply(vl,3,function(vl) which(rowSums(vl)!=0))[1:n]; for(i in 1:n){uncl[j,i,1]=unlist(cl[[j]][i])} for(i in 1:nnodes){for(ii in 1:nc){if(any(vl[,,i][ii,]!=0)){vlf[[i]] <- vl[,,i][ii,]; vvlf[[i]] <- vvl[,,i][ii,]}}}; vlf1[[j]] <- do.call('rbind',vlf); vvlf1[[j]] <- do.call('rbind',vvlf) }; junk <- sapply(vlf1,function(i) !is.null(i)) l=vlf1[junk] # l for variable and ll for value selection x <- sapply(1:length(l), function(x) { sum(sapply(l, function(y) { if ( nrow(l[[x]]) != nrow(y) | ncol(l[[x]]) != ncol(y) ) FALSE else 
sum(y != l[[x]]) == 0 })) } ); varseq.max <- max(x) wxmax <- which(x==max(x)); lwxmax_var <- length(wxmax); ll=vvlf1[junk]; llfresh <- ll[wxmax]; lfresh <- l[wxmax] # l for value and ll for variable selection x <- sapply(1:length(llfresh), function(x) { sum(sapply(llfresh, function(y) { if ( nrow(llfresh[[x]]) != nrow(y) | ncol(llfresh[[x]]) != ncol(y) ) FALSE else sum(y != llfresh[[x]]) == 0 })) } ); valseq.max <- max(x) wxmax1 <- which(x==max(x)); lwxmax_val <- length(wxmax1); bestsamples <- wxmax[wxmax1]; eTrainbest <- eTraining[bestsamples]; weTrainmin <- which(eTrainbest == min(eTrainbest)); stepwisemax <- cbind(varseq.max,valseq.max); lweTrain_min <- length(weTrainmin); lastweTrainmin <- weTrainmin[lweTrain_min]; bestsamplesf <- bestsamples[weTrainmin]; lngth <- length(bestsamplesf); finalsample <- bestsamplesf[lngth]; mm <- dataset[tdi[[finalsample]],]; l1 <- l[[finalsample]]; indxl1 <- colSums(l1)!=0; var <- matrix(l1[,indxl1],nrow=nrow(l1)); allmaxfreq <- cbind(lwxmax_var,lwxmax_val,lweTrain_min); ll2 <- ll[[finalsample]]; indxll2 <- colSums(ll2)!=0; val <- round(matrix(ll2[,indxll2],nrow=nrow(ll2)),digits=4); mat <- as.data.frame(matrix(paste(var,"(",val,")",sep=""),nrow=nrow(var))); print("Sample number having max freq"); print(wxmax); print("Samples index having max value-wise freq from above samples"); print(wxmax1); print("length of max and same frequencies"); print(allmaxfreq); print("step-wise most repeated variable-value sequence"); print(stepwisemax); print("Sample index with min error"); print(weTrainmin); print("final sample"); print(finalsample); print("Each row of matrix is terminal node, L for left and R for right node") indTrainfinal = indTrainlist[[finalsample]]; indTestfinal = indTestlist[[finalsample]] mm = dataset[indTrainfinal,]; ww = dataset[indTestfinal,] #print(indTrainfinal); print(indTestfinal) e=0; n <- 0; avelength <- lmsum/noofsamples; units <- c(); misc.units <- c(); deviance <- c() func=function(data,testdata,level) { 
var1=c(); #print(cbind(ncol(testdata),nrow(testdata))) y=table(testdata[,response]); pp=y/nrow(testdata) f=matrix(y); pp=matrix(pp) Y=as.real(names(y)) z=match(mcat,Y); Y=Y[z]; Y[is.na(Y)]=0 f=f[z]; f[is.na(f)]=0; sumf <- sum(f) pp=pp[z]; pp[is.na(pp)]=0 devN=f*log(pp); devN[is.na(devN)]=0; dTest=(-2)*sum(matrix(devN)); fr=table(data[,response]); wf=as.real(names(which.max(fr))); tecN <- nrow(testdata)-max(table(testdata[,response])); for(q in 1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0 if (any(var1==0) || nrow(data)<=5) { n <<- n + 1; nodeclas[n] <<- wf; nodeside[n] <<- side tec1NF <<- tec1NF + tecN; units[n] <<- sumf; misc.units[n] <<- tecN; deviance[n] <<- dTest return() }else { data1 = data; data2 = testdata; s=sv(data1)[[method]]; s1 <- s[1]; s2=s[2]; level <- level + 1 data=LN(data1,s1,s2); testdata=LN(data2,s1,s2); side <<- c("L") func(data,testdata,level) data=RN(data1,s1,s2); testdata=RN(data2,s1,s2); side <<- c("R") func(data,testdata,level) } } func(mm,ww,0) deviance <- round(deviance,digits=2); LorRnode <- nodeside; class <- nodeclas resMat <- cbind(mat,LorRnode,units,misc.units,deviance,class); print("un-pruned tree"); print(resMat) eTrain <- round(mean(eTraining)/avelength*100,digits=2); eTest <- round(tec1NF/nrow(ww)*100,digits=2); dev.sum <- sum(deviance); print(cbind(eTrain,eTest,dev.sum)) n <- 0; vr1<-c(); vv1<-c(); vlp=array(0,dim=c(nc,nlvls,nnodes)); vvlp=array(0,dim=c(nc,nlvls,nnodes)); avelength <- lmsum/noofsamples; unit <- c(); misc.unit <- c(); deviancep <- c(); wf1 <- 0.1; wf2 <- 0.2; vlfp=list(); vvlfp=list(); #esum <- c("esum") funcp=function(data,testdata,levelp) { var1=c(); var1l=c(); var1r=c() y=table(testdata[,response]); pp=y/nrow(testdata) f=matrix(y); pp=matrix(pp) Y=as.real(names(y)) z=match(mcat,Y); Y=Y[z]; Y[is.na(Y)]=0 f=f[z]; f[is.na(f)]=0; sumf <- sum(f) pp=pp[z]; pp[is.na(pp)]=0 devN=f*log(pp); devN[is.na(devN)]=0; dTest=(-2)*sum(matrix(devN)); fr=table(data[,response]); wf=as.real(names(which.max(fr))); tecN <- 
nrow(testdata)-max(table(testdata[,response])); e <- nrow(data)-max(table(data[,response])) for(q in 1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0 if(nrow(data) > 5 && all(var1 != 0)){ss=sv(data)[[method]]; ss1=ss[1]; ss2=ss[2]; dat=LN(data,ss1,ss2); n1=nrow(dat); for(q1 in 1:response){var1l[q1]=var(dat[,q1])}; var1l[is.na(var1l)]=0; e1 <- nrow(dat)-max(table(dat[,response])); dat1=RN(data,ss1,ss2); n2=nrow(dat1); for(q2 in 1:response){var1r[q2]=var(dat1[,q2])}; var1r[is.na(var1r)]=0; e2 <- nrow(dat1)-max(table(dat1[,response])); f1=table(dat[,response]); f2=table(dat1[,response]); wf1=as.real(names(which.max(f1))); wf2=as.real(names(which.max(f2)))}; #esum <- sum(e1,e2) if (any(var1==0) || wf1==wf2 || nrow(data)<=5) # || esum==e { n <<- n + 1; nodeclasp[n] <<- wf; nodesidep[n] <<- side if (levelp > 0){ for (b in 1 : levelp) { vlp[wf,b,n] <<- vr1[b]; vvlp[wf,b,n] <<- vv1[b] # vl=variable level, vvl=variable value level }} tec2NF <<- tec2NF + tecN; unit[n] <<- sumf; misc.unit[n] <<- tecN; deviancep[n] <<- dTest return() }else { data1 = data; data2 = testdata; s=sv(data1)[[method]]; s1 <- s[1]; s2=s[2]; levelp <- levelp + 1 vr1[levelp] <<- s1; vv1[levelp] <<- s2 # vv=variable value data=LN(data1,s1,s2); testdata=LN(data2,s1,s2); side <<- c("L") funcp(data,testdata,levelp) data=RN(data1,s1,s2); testdata=RN(data2,s1,s2); side <<- c("R") funcp(data,testdata,levelp) } } funcp(mm,ww,0) clp=apply(vlp,3,function(vlp) which(rowSums(vlp)!=0))[1:n]; for(i in 1:nnodes){for(ii in 1:nc){if(any(vlp[,,i][ii,]!=0)){vlfp[[i]] <- vlp[,,i][ii,]; vvlfp[[i]] <- vvlp[,,i][ii,]}}}; vlf1p <- do.call('rbind',vlfp); vvlf1p <- do.call('rbind',vvlfp); lp=vlf1p; indlp <- colSums(lp) != 0; lp=matrix(lp[,indlp],nrow=nrow(lp)); llp=vvlf1p; indllp <- colSums(llp) != 0; llp=round(matrix(llp[,indllp],nrow=nrow(llp)),digits=4); print("pruned tree"); matp <- as.data.frame(matrix(paste(lp,"(",llp,")",sep=""),nrow=nrow(lp))); deviance <- round(deviancep,digits=2); LorRnode <- nodesidep; class <- 
nodeclasp resMatp <- cbind(matp,LorRnode,unit,misc.unit,deviance,class); print(resMatp) eTrain <- round(mean(eTraining)/avelength*100,digits=2); eTest <- round(tec2NF/nrow(ww)*100,digits=2); dev.sum <- sum(deviance); print(cbind(eTrain,eTest,dev.sum)) } CTPrundUnprund(iris,150,1,4,5,1,3,10,10) best regards Muhammad Azam [[alternative HTML version deleted]]
______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.