Dear friends,
I hope you are all well. I am working on decision trees and have written the
following program to construct classification trees. It works, but it sometimes
stops with the error message "negative extents to matrix". I have tried to
resolve the issue but could not. How can I avoid this error? Thanks.
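
My guess is that the message comes from a matrix() call receiving an invalid
dimension when a split leaves one child node empty. The small sketch below is
only an illustration of the kind of guard I think might avoid it (safe_child is
a hypothetical name, not part of the program that follows):

# Hypothetical guard: build a child node only when the split keeps some rows.
safe_child <- function(m, variable, value) {
  keep <- m[, variable] <= value     # logical row filter for the left child
  if (!any(keep)) return(NULL)       # empty child: return NULL, never call matrix() with a bad size
  m[keep, , drop = FALSE]            # row subset that keeps the matrix shape
}
# e.g. safe_child(as.matrix(iris[, 1:4]), 1, 5.0)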

####################################
ds=iris; dl=nrow(ds)   # ds: dataset, dl: data length
c1=ds[,1]; c2=ds[,2]; c3=ds[,3]; c4=ds[,4]; c5=ds[,5]; id=c(1:dl); 
iris=cbind(c1,c2,c3,c4,c5)
tec1NF = 0; tec2NF = 0; lmsum = 0; indTrainlist = list(); indTestlist = list()
options(digits=6)
 
CTPrundUnprund=function(dataset,samplesize,noofsamples,noofpredictors,response,method,nc,nlvls,nnodes)
 {
vlf1=list(); vlf2=array(0,dim=c(noofsamples,nlvls,nnodes)); cl=list(); 
uncl=array(0,dim=c(noofsamples,nlvls,1)); vvlf1=list(); nodeclas=c(); tdi=c(); 
eTraining=c(); nodeside=c(); unclp=array(0,dim=c(nlvls,1)); 
vvlf2=array(0,dim=c(noofsamples,nlvls,nnodes)); nodeclasp=c(); nodesidep=c()
 
   for (j in 1:noofsamples) 
   {
     groupindex = 1:nrow(dataset)
     index = sample(groupindex,samplesize, replace=TRUE)
     indTrain=unique(sort(index)); indTest=groupindex[-indTrain]; 
m=dataset[indTrain,]; lm <- nrow(m); lmsum <<- lmsum + lm; 
#w=dataset[indTest,]; 
     tdi[j]=list(indTrain); indTrainlist[j] = list(indTrain); indTestlist[j] = list(indTest)
 mcat=table(m[,response])
 mcat=data.frame(mcat)     # or  mcat=list(mcat)
 mcat=mcat[,1]
 mcat=matrix(mcat)
 mcat=as.numeric(mcat)
 print(j)
  d1=0
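  # sv(): for the data at one node, scan every predictor and split value and return,
  # for each criterion (new, Gini, twoing, entropy), the best (variable, value, gain)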
  sv=function(dataset)
  {
 gnew=c(); ggini=c(); gtwoing=c(); gentropy=c(); val=c(); mGnewLr=c(); 
mGginiLr=c(); mGtwoingLr=c(); mGentropyLr=c(); sPjnew=c(); spLnew=c(); 
spRnew=c(); matrGnew=c(); matrGgini=c(); matrGtwoing=c(); matrGentropy=c(); 
sPjgini=c(); spLgini=c(); spRgini=c(); sAbsDiff=c(); sPj2=c(); spL2=c(); 
spR2=c(); sPj=c(); spL=c(); spR=c(); logPj=c(); logpR=c(); logpL=c(); 
variables=c()
 m=dataset
 mcat=table(m[,response])
 mcat=data.frame(mcat)     # or  mcat=list(mcat)
 mcat=mcat[,1]
 mcat=matrix(mcat)
 mcat=as.numeric(mcat)
 
 Pj=table(m[,response])/nrow(m)
 Pj
 # Left Node
   for (variable in 1:noofpredictors)
   {
     gnew=NULL; ggini=NULL; gtwoing=NULL; gentropy=NULL; val=NULL 
 varSort=sort(unique(dataset[,variable]))
#varSort1=sort(unique(m[,variable])); minval <- min(varSort1); maxval <- max(varSort1)
#r <- maxval-minval; K <- round(1+(3.3*log(nrow(m),10)),digits=0); H <- r/K; varSort <- seq(minval, maxval, H)
 L=length(varSort)
 for (k in 1 : (L-1))
 {
  val[k]=varSort[k]
  x=m[m[,variable]<=val[k]]   # data (all columns) satisfying the condition
  lenx=length(x)              # length of x
  lenx=lenx/ncol(m)           # x length / no of columns = rows in left node
  x_left=matrix(x,lenx)       # rebuild as matrix
  p1=lenx/nrow(m)
  y=x_left[,response]
  y1=table(y)
  feqy1=matrix(y1)
  y1=data.frame(y1)
  y1=y1[,1]
  y1=matrix(y1)
  y1=as.real(y1)
  z=match(mcat,y1)
  y1=y1[z]
  y1[is.na(y1)]=0
  feqy1=feqy1[z]
  feqy1[is.na(feqy1)]=0
  cbind(y1, feqy1)
  pL=feqy1/length(x_left[,response])
  pL
  
  # Right Node
  X=m[m[,variable]>val[k]]    # data (all columns) satisfying the condition
  lenX=length(X)              # length of X
  lenX=lenX/ncol(m)           # X length / no of columns = rows in right node
  x_right=matrix(X,lenX)      # rebuild as matrix
  p2=lenX/nrow(m)
  Y=x_right[,response]
  Y1=table(Y)
  feqy2=matrix(Y1)
  Y1=data.frame(Y1)
  Y1=Y1[,1]
  Y1=matrix(Y1)
  Y1=as.real(Y1)
  Z=match(mcat,Y1)
  Y1=Y1[Z]
  Y1[is.na(Y1)]=0
  feqy2=feqy2[Z]
  feqy2[is.na(feqy2)]=0
  cbind(Y1, feqy2)
  pR=feqy2/length(x_right[,response])
  pR
 
  for ( i in 1: length(Pj))
  {
   sPjnew[i]=Pj[i]*exp(Pj[i]); spRnew[i]=pR[i]*exp(pR[i]); 
spLnew[i]=pL[i]*exp(pL[i]) # New method
   sPjgini[i]=(Pj[i]^2); spRgini[i]=(pR[i]^2); spLgini[i]=(pL[i]^2)   # Gini method
   sAbsDiff[i]=abs(pL[i]-pR[i])        # Twoing method 
   # Entropy method: use log(p) where p > 0, and 0 otherwise
   if (Pj[i]!=0) { logPj[i]=log(Pj[i]) } else { logPj[i]=0 }
   if (pR[i]!=0) { logpR[i]=log(pR[i]) } else { logpR[i]=0 }
   if (pL[i]!=0) { logpL[i]=log(pL[i]) } else { logpL[i]=0 }
  sPj2[i]=(Pj[i]^2); spR2[i]=(pR[i]^2); spL2[i]=(pL[i]^2); 
sPj[i]=-(Pj[i]*logPj[i]); spL[i]=-(pL[i]*logpL[i]); spR[i]=-(pR[i]*logpR[i])
  }
  G1=exp(-1)*(p1*sum(spLnew)+p2*sum(spRnew)-sum(sPjnew)); 
G2=p1*sum(spLgini)+p2*sum(spRgini)-sum(sPjgini)
  G3=0.25*p1*p2*(sum(sAbsDiff))^2; G4=sum(sPj)-(p1*sum(spL))-(p2*sum(spR)) 
 
  gnew[k]=G1; ggini[k]=G2; gtwoing[k]=G3; gentropy[k]=G4         
 
 }
 gnew; ggini; gtwoing; gentropy; val
 
 ind1=sort.list(gnew); ind2=sort.list(ggini); ind3=sort.list(gtwoing);
 ind4=sort.list(gentropy) 
 gnew=gnew[ind1]; ggini=ggini[ind2]; gtwoing=gtwoing[ind3]; 
gentropy=gentropy[ind4]
 valGnew=val[ind1]; valGgini=val[ind2]; valGtwoing=val[ind3]; 
valGentropy=val[ind4]
  
 cmGnew=cbind(valGnew,gnew); cmGgini=cbind(valGgini,ggini); 
cmGtwoing=cbind(valGtwoing,gtwoing); cmGentropy=cbind(valGentropy,gentropy)
 mGnew=matrix(cmGnew,length(gnew)); mGgini=matrix(cmGgini,length(ggini)); 
mGtwoing=matrix(cmGtwoing,length(gtwoing));       
mGentropy=matrix(cmGentropy,length(gentropy))
 
 mGnewLr[variable]=list(mGnew[length(gnew),]); 
mGginiLr[variable]=list(mGgini[length(ggini),]);          
mGtwoingLr[variable]=list(mGtwoing[length(gtwoing),]); 
mGentropyLr[variable]=list(mGentropy[length(gentropy),])     
   }
 cbnew=do.call('cbind',mGnewLr);
 cbgini=do.call('cbind',mGginiLr); cbtwoing=do.call('cbind',mGtwoingLr); 
cbentropy=do.call('cbind',mGentropyLr)
 
 for(i in 1:noofpredictors){variables[i]=i}
 
 cb11=cbnew[1,]; cb12=cbnew[2,]; cb21=cbgini[1,]; cb22=cbgini[2,]; 
cb31=cbtwoing[1,]; cb32=cbtwoing[2,]; cb41=cbentropy[1,]; cb42=cbentropy[2,]
     
 indexGnew=sort.list(cb12); indexGgini=sort.list(cb22); 
indexGtwoing=sort.list(cb32); indexGentropy=sort.list(cb42)
 cb11=cb11[indexGnew]; cb12=cb12[indexGnew]
 cb21=cb21[indexGgini]; cb22=cb22[indexGgini]
 cb31=cb31[indexGtwoing]; cb32=cb32[indexGtwoing]
 cb41=cb41[indexGentropy]; cb42=cb42[indexGentropy]
 varGnew=variables[indexGnew]; varGgini=variables[indexGgini]; 
varGtwoing=variables[indexGtwoing]; varGentropy=variables[indexGentropy] 
 cbGnew=cbind(varGnew,cb11,cb12)
 cbGgini=cbind(varGgini,cb21,cb22)
 cbGtwoing=cbind(varGtwoing,cb31,cb32)
 cbGentropy=cbind(varGentropy,cb41,cb42)
 mGnew=matrix(cbGnew,length(cb12))
 mGgini=matrix(cbGgini,length(cb22))
 mGtwoing=matrix(cbGtwoing,length(cb32))
 mGentropy=matrix(cbGentropy,length(cb42))
 matrGnew=mGnew[length(cb12),]
 matrGgini=mGgini[length(cb22),]
 matrGtwoing=mGtwoing[length(cb32),]
 matrGentropy=mGentropy[length(cb42),]
 variableimportance=list(matrGnew,matrGgini,matrGtwoing,matrGentropy)
 variableimportance  
}    
 #sv(m)      # Passing parameters
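   # LN(): rows of 'dataset' with dataset[,variable] <= value (left child), rebuilt as a matrix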
   LN=function(dataset,variable,value)
   { 
       
 x_left=c()
 m=dataset
 x=m[m[,variable]<=value]     # data (all columns) satisfying the condition
 lenx=length(x)     # Length of x
 lenx=lenx/ncol(m)    # xl/no of columns
 x_left=matrix(x,lenx)    # matrix(x,xr)
 return(x_left) 
   }
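   # RN(): rows of 'dataset' with dataset[,variable] > value (right child), rebuilt as a matrix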
   RN=function(dataset,variable,value)
   { 
 x_right=c()
 m=dataset
 X=m[m[,variable]>value]    # data (all columns) satisfying the condition
 lengX=length(X)     # Length of x
 lengX=lengX/ncol(m)    # xl/no of columns
 x_right=matrix(X,lengX)    # rebuild as matrix
 return(x_right) 
   }
e=0; n <- 0; vr<-c(); vv<-c(); vl=NULL; vl=array(0,dim=c(nc,nlvls,nnodes)); 
vvl=NULL; vvl=array(0,dim=c(nc,nlvls,nnodes)); vlf=NULL; vlf=list(); vvlf=NULL; 
vvlf=list(); trec1N = 0
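    # func(): grow the tree recursively on one bootstrap training sample; a node becomes
    # terminal when some column has zero variance or the node holds 5 or fewer rows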
    func=function(data,level) 
    {
 var1=c()
 f=table(data[,response]); wf=as.numeric(names(which.max(f)))
 trecN <- nrow(data)-max(table(data[,response]))
 for(q in 1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0; 
 if (any(var1==0) || nrow(data)<=5)
 {
                n <<- n + 1
  if (level > 0){ for (a in 1 : level)
  {
  vl[wf,a,n] <<- vr[a]; vvl[wf,a,n] <<- vv[a]  # vl=variable level, vvl=variable value level
  }}
  trec1N <<- trec1N + trecN 
  
   return() 
 }else 
  {  
   data1 = data; s=sv(data1)[[method]]; s1 <- s[1]; s2=s[2]; level <- level + 1
      
   vr[level] <<- s1; vv[level] <<- s2 # vv=variable value
   data=LN(data1,s1,s2)    
    func(data,level) 
   data=RN(data1,s1,s2)
   func(data,level)
  } 
    } 
  func(m,0)   # Passing parameters
eTraining[j] <- trec1N  
cl[[j]]=apply(vl,3,function(vl) which(rowSums(vl)!=0))[1:n]; for(i in 
1:n){uncl[j,i,1]=unlist(cl[[j]][i])}
for(i in 1:nnodes){for(ii in 1:nc){if(any(vl[,,i][ii,]!=0)){vlf[[i]] <- 
vl[,,i][ii,]; vvlf[[i]] <- vvl[,,i][ii,]}}}; vlf1[[j]] <- do.call('rbind',vlf); 
vvlf1[[j]] <-
 do.call('rbind',vvlf)
   }; junk <- sapply(vlf1,function(i) !is.null(i))
l=vlf1[junk]  # l for variable and ll for value selection
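# for each sample, count how many samples produced an identical variable matrix;
# the most frequent sequence (then the minimum training error) picks the final sample/tree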
x <- sapply(1:length(l), function(x) {
  sum(sapply(l, function(y) {
    if ( nrow(l[[x]]) != nrow(y) | ncol(l[[x]]) != ncol(y) ) FALSE
    else sum(y != l[[x]]) == 0
  }))
} ); varseq.max <- max(x) 
wxmax <- which(x==max(x)); lwxmax_var <- length(wxmax); 
ll=vvlf1[junk]; llfresh <- ll[wxmax]; lfresh <- l[wxmax] 
# l for value and ll for variable selection
x <- sapply(1:length(llfresh), function(x) {
  sum(sapply(llfresh, function(y) {
    if ( nrow(llfresh[[x]]) != nrow(y) | ncol(llfresh[[x]]) != ncol(y) ) FALSE
    else sum(y != llfresh[[x]]) == 0
  }))
} ); valseq.max <- max(x)
wxmax1 <- which(x==max(x)); lwxmax_val <- length(wxmax1); bestsamples <- 
wxmax[wxmax1]; eTrainbest <- eTraining[bestsamples]; 
weTrainmin <- which(eTrainbest == min(eTrainbest)); stepwisemax <- 
cbind(varseq.max,valseq.max); lweTrain_min <- length(weTrainmin); 
lastweTrainmin <- weTrainmin[lweTrain_min]; bestsamplesf <- 
bestsamples[weTrainmin]; lngth <- length(bestsamplesf); finalsample <- 
bestsamplesf[lngth]; 
mm <- dataset[tdi[[finalsample]],]; l1 <- l[[finalsample]]; indxl1 <- 
colSums(l1)!=0; var <- matrix(l1[,indxl1],nrow=nrow(l1)); 
allmaxfreq <- cbind(lwxmax_var,lwxmax_val,lweTrain_min); ll2 <- 
ll[[finalsample]]; indxll2 <- colSums(ll2)!=0; 
val <- round(matrix(ll2[,indxll2],nrow=nrow(ll2)),digits=4); 
mat <- as.data.frame(matrix(paste(var,"(",val,")",sep=""),nrow=nrow(var))); 
print("Sample number having max freq"); print(wxmax); print("Samples index 
having max value-wise freq from above samples"); print(wxmax1); 
print("length of max and same frequencies"); print(allmaxfreq); 
print("step-wise most repeated variable-value sequence"); print(stepwisemax); 
print("Sample index with min error"); print(weTrainmin); print("final sample"); 
print(finalsample); 
print("Each row of matrix is terminal node, L for left and R for right node")
indTrainfinal = indTrainlist[[finalsample]]; indTestfinal = 
indTestlist[[finalsample]]
mm = dataset[indTrainfinal,]; ww = dataset[indTestfinal,]
#print(indTrainfinal); print(indTestfinal)
e=0; n <- 0; avelength <- lmsum/noofsamples; units <- c(); misc.units <- c(); 
deviance <- c()
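    # func() (re-defined): drop the chosen training/test pair through the un-pruned tree,
    # recording per terminal node the class, side (L/R), test units, misclassifications and deviance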
    func=function(data,testdata,level) 
    {
 var1=c(); #print(cbind(ncol(testdata),nrow(testdata)))
 y=table(testdata[,response]); pp=y/nrow(testdata)
 f=matrix(y);
 pp=matrix(pp)
 Y=as.numeric(names(y))
 z=match(mcat,Y); Y=Y[z]; Y[is.na(Y)]=0
 f=f[z]; f[is.na(f)]=0; sumf <- sum(f)
 pp=pp[z]; pp[is.na(pp)]=0
 devN=f*log(pp); devN[is.na(devN)]=0; dTest=(-2)*sum(matrix(devN)); 
fr=table(data[,response]); wf=as.numeric(names(which.max(fr)))
 tecN <- nrow(testdata)-max(table(testdata[,response])); for(q in 
1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0 
 if (any(var1==0) || nrow(data)<=5)
 {
  n <<- n + 1; nodeclas[n] <<- wf; nodeside[n] <<- side
  tec1NF <<- tec1NF + tecN; units[n] <<- sumf; misc.units[n] <<- tecN; 
deviance[n] <<- dTest 
  
   return() 
 }else 
  {  
   data1 = data; data2 = testdata; s=sv(data1)[[method]]; s1 <- s[1]; s2=s[2]; level <- level + 1
      
   data=LN(data1,s1,s2); testdata=LN(data2,s1,s2); side <<- c("L")   
    func(data,testdata,level) 
   data=RN(data1,s1,s2); testdata=RN(data2,s1,s2); side <<- c("R")
   func(data,testdata,level)
  } 
    } 
  func(mm,ww,0)
deviance <- round(deviance,digits=2); LorRnode <- nodeside; class <- nodeclas
resMat <- cbind(mat,LorRnode,units,misc.units,deviance,class); print("un-pruned tree"); print(resMat)
eTrain <- round(mean(eTraining)/avelength*100,digits=2); eTest <- 
round(tec1NF/nrow(ww)*100,digits=2); dev.sum <- sum(deviance); 
print(cbind(eTrain,eTest,dev.sum))
n <- 0; vr1<-c(); vv1<-c(); vlp=array(0,dim=c(nc,nlvls,nnodes)); 
vvlp=array(0,dim=c(nc,nlvls,nnodes)); avelength <- lmsum/noofsamples; unit <- 
c(); 
misc.unit <- c(); deviancep <- c(); wf1 <- 0.1; wf2 <- 0.2; vlfp=list(); 
vvlfp=list(); #esum <- c("esum")
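    # funcp(): same traversal, but a node is also made terminal when both candidate children
    # would predict the same class (the pruning rule), giving the pruned tree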
    funcp=function(data,testdata,levelp) 
    {
 var1=c(); var1l=c(); var1r=c()
 y=table(testdata[,response]); pp=y/nrow(testdata)
 f=matrix(y); pp=matrix(pp)
 Y=as.numeric(names(y))
 z=match(mcat,Y); Y=Y[z]; Y[is.na(Y)]=0
 f=f[z]; f[is.na(f)]=0; sumf <- sum(f)
 pp=pp[z]; pp[is.na(pp)]=0
 devN=f*log(pp); devN[is.na(devN)]=0; dTest=(-2)*sum(matrix(devN)); 
fr=table(data[,response]); wf=as.numeric(names(which.max(fr)))
 tecN <- nrow(testdata)-max(table(testdata[,response])); e <- 
nrow(data)-max(table(data[,response]))
 for(q in 1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0
 if(nrow(data) > 5 && all(var1 !=
 0)){ss=sv(data)[[method]]; ss1=ss[1]; ss2=ss[2]; dat=LN(data,ss1,ss2); 
n1=nrow(dat); 
for(q1 in 1:response){var1l[q1]=var(dat[,q1])}; var1l[is.na(var1l)]=0; e1 <- 
nrow(dat)-max(table(dat[,response])); dat1=RN(data,ss1,ss2); n2=nrow(dat1); 
for(q2 in 1:response){var1r[q2]=var(dat1[,q2])}; var1r[is.na(var1r)]=0; e2 <- 
nrow(dat1)-max(table(dat1[,response])); f1=table(dat[,response]); 
f2=table(dat1[,response]); wf1=as.numeric(names(which.max(f1))); 
wf2=as.numeric(names(which.max(f2)))}; #esum <- sum(e1,e2)
 if (any(var1==0) || wf1==wf2 || nrow(data)<=5)   #  || esum==e
 {
  n <<- n + 1; nodeclasp[n] <<- wf; nodesidep[n] <<- side
  if (levelp > 0){ for (b in 1 : levelp)
  {
  vlp[wf,b,n] <<- vr1[b]; vvlp[wf,b,n] <<- vv1[b]  # vlp=variable level, vvlp=variable value level
  }}
  tec2NF <<- tec2NF + tecN; unit[n] <<- sumf; misc.unit[n] <<- tecN; 
deviancep[n] <<- dTest 
  
   return() 
 }else 
  {  
   data1 = data; data2 = testdata; s=sv(data1)[[method]]; s1 <- s[1]; s2=s[2]; 
levelp <- levelp + 1
      
   vr1[levelp] <<- s1; vv1[levelp] <<- s2 # vv=variable value
      
   data=LN(data1,s1,s2); testdata=LN(data2,s1,s2); side <<- c("L")   
    funcp(data,testdata,levelp) 
   data=RN(data1,s1,s2); testdata=RN(data2,s1,s2); side <<- c("R")
   funcp(data,testdata,levelp)
  } 
    } 
  funcp(mm,ww,0)
clp=apply(vlp,3,function(vlp) which(rowSums(vlp)!=0))[1:n]; 
for(i in 1:nnodes){for(ii in 1:nc){if(any(vlp[,,i][ii,]!=0)){vlfp[[i]] <- 
vlp[,,i][ii,]; vvlfp[[i]] <- vvlp[,,i][ii,]}}}; vlf1p <- do.call('rbind',vlfp); 
vvlf1p <- do.call('rbind',vvlfp); 
lp=vlf1p; indlp <- colSums(lp) != 0; lp=matrix(lp[,indlp],nrow=nrow(lp)); 
llp=vvlf1p; indllp <- colSums(llp) != 0; 
llp=round(matrix(llp[,indllp],nrow=nrow(llp)),digits=4); 
print("pruned tree"); matp <- 
as.data.frame(matrix(paste(lp,"(",llp,")",sep=""),nrow=nrow(lp))); 
deviance <- round(deviancep,digits=2); LorRnode <- nodesidep; class <- nodeclasp
resMatp <- cbind(matp,LorRnode,unit,misc.unit,deviance,class); print(resMatp)   
 
eTrain <- round(mean(eTraining)/avelength*100,digits=2); eTest <- 
round(tec2NF/nrow(ww)*100,digits=2); dev.sum <- sum(deviance);
 print(cbind(eTrain,eTest,dev.sum))
 }
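  # call: dataset, samplesize, noofsamples, noofpredictors, response column, method (1=new, 2=Gini, 3=twoing, 4=entropy), nc=no. of classes, nlvls=max levels, nnodes=max nodes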
  CTPrundUnprund(iris,150,1,4,5,1,3,10,10) 




best regards

Muhammad Azam  