[R] R help

Mª Teresa Martinez Soriano Tue, 06 Aug 2013 10:36:38 -0700

Hi everyone, I'm sorry for my questions; I'm sure they are totally stupid, but 
I am completely new to this program and I am facing this "danger" alone.


I have done the imputation for one part of my data set; however, I am not able 
to do it in general.
This is part of my data set (cast2):
cast2[1:30,]    X.     Fecha1     Fecha2 CEES.NUMERO SECTOR IE.2003 IE.2004 
IE.2005 IE.2006 IE.2007 IE.2008 IE.2009 IE.2010 rS2   15 17/05/1999 10/02/2011  
      7420   APCT     173     125     155      74      NA      NA      13      
NA  35   23 27/06/1998 18/06/2013        4941     TA    1358    1803    2115    
2390    2506    2320    2008    2007  011  58  4/12/1997 18/06/2013        4772 
   CRV      93     179     221     196     297     191     126     112  015  87 
30/09/2004 18/06/2013        4121      C      NA      31     390    1246    
3762    1430      NA      NA  316  94  1/03/2006 18/06/2013        4121      C  
    NA      NA      NA     212     513     706     202     127  317  97 
20/12/2005 18/06/2013        4110      C      NA      NA      NA      64      
98     251      79     176  320 133 30/09/2002 18/06/2013        7112   APCT    
 153     279     289     370     412     262     115      75  021 138 
11/07/2002 13/05/2009        4121      C    5460    7863    8365   12009   
16763      NA      NA      NA  323 152 27/05/1999 18/06/2013        7490   APCT 
     NA      80      77      60      89     137     144     146  124 154 
21/12/2004 18/06/2013        6820     AI      NA      NA     148     186     
302     233     194     204  226 177 20/02/1996 18/06/2013        7490   APCT   
   16       4      NA       3       3      NA       5       5  227 185  
6/03/1992 12/08/2011        6820     AI      26      NA      21      21      NA 
     21      21      16  232 231 14/03/2001 27/06/2011        6810     AI      
NA      63      76      79      72       5      NA      NA  338 272 28/03/2001 
18/06/2013        4110      C    2462    5571    5880    6159    6951     927   
 1102    1289  040 288 12/02/1997 18/06/2013        5630      H     307     671 
    805     979      NA     558     238     449  141 306  1/01/2000 18/06/2013  
      7311   APCT     161     200     250     250     263     161      43      
50  042 311 21/02/2001 18/06/2013        6831     AI      NA      51      89    
  69     135      28      11      12  147 373 18/07/1995 18/06/2013        4619 
   CRV     159      NA      NA      NA     161     192     208     230  349 389 
27/07/1990 18/06/2013        5610      H     686     750     749     783     
795     645     514     415  054 410 19/11/1992 18/06/2013        6920   APCT   
  330     290     290     342     387     415     465     421  055 420  
9/01/2004 18/06/2013        5610      H      NA     205     335     267     234 
    211     194     204  159 443 18/01/2005 18/06/2013        4110      C      
NA      NA       7     702     957     195    1489       5  263 463 13/03/2006 
18/06/2013        7311   APCT      NA      NA      NA      71     190     219   
  172     109  364 465 16/01/1995 18/06/2013        6920   APCT       7      42 
     42      42      90      60      36      12  071 503  8/06/1992 18/06/2013  
      2512     IM     470     551     549     582     638     618     510     
472  073 510 12/02/1997 18/06/2013        4759    CRV     182     212     293   
  299     322     226     231      NA  176 527 26/09/2003 18/06/2013        
7111   APCT      30     112     144      73      NA     171      51      68  
178 548 19/07/2002 18/06/2013        4673    CRV     158     951    1025     
301     112     358      18       8  079 552  4/11/1997 07/09/2011        4675  
  CRV    7868    9420   10772   15140   14843   12682    9704   14077  082 603  
1/01/1996 18/06/2013        4334      C      47      49      69      NA      NA 
     80      96      76  2

# Point R at the directory that holds our data.
# NOTE(review): a hard-coded setwd() ties the script to one machine.
setwd("C:/rprueba")
# Read the data set from the clipboard (decimal comma).
# The mail wrap had glued the tail of the comment ("datos") onto the variable
# name, so the data ended up in `datoscastellon` while every later step reads
# `castellon`.
castellon <- read.delim(
  "clipboard",
  header = TRUE, dec = ",", check.names = TRUE
)

# STEP 1: count the missing values first, so that the companies we are not
# going to use can be dropped.

# Number of NAs in each row, over columns 18:24.
rS <- rowSums(is.na(castellon[, 18:24]))
# Attach an `rS` column holding the number of NAs.
# NOTE(review): this counts NAs over ALL columns instead of reusing the
# 18:24 count computed just above -- confirm which one is intended.
castellon["rS"] <- rowSums(is.na(castellon))
# Index of the last column of the data frame (presumably the `rS` column
# attached above).
d <- dim(castellon)[2]
# Select the rows with fewer than 4 missing values.
p <- which(castellon[, d] <= 3, arr.ind = TRUE)
# Keep those rows as a new data frame.
cast <- castellon[p, ]
cast[1:20, ]
# STEP 2: for the companies kept (missing values <= 3), split the sample into
# the groups of columns we are interested in.
# cast1 holds the first columns, which carry the descriptive information.
# cast2 holds the company number plus the variables to impute.
cast1 <- cast[, c(1:12, 14:16)]
cast2 <- cast[, c(1, 8, 12, 15:25)] # with the dates

# Split both tables by sector. The two assignments had been fused onto one
# line, which R cannot parse.
x <- split(cast1, cast1$SECTOR)
y <- split(cast2, cast2$SECTOR)

# Write one file per sector for each of the two tables.
# paste() separates its pieces with a space, so the file names look like
# "cast1_sector 1 .csv"; this is kept exactly as in the original.
for (i in seq_along(x)) {
  write.table(
    x[i],
    paste(paste("cast1_sector", i), ".csv"),
    col.names = TRUE, row.names = FALSE
  )
  write.table(
    y[i],
    paste(paste("cast2_sector", i), ".csv"),
    col.names = TRUE, row.names = FALSE
  )
}

# These are the values I do not want to impute.

# Store both date columns as character so they can be split on "/".
for (i in seq_along(x)) { # x or y, it does not matter: same length
  y[[i]]$Fecha1 <- as.character(y[[i]]$Fecha1)
  y[[i]]$Fecha2 <- as.character(y[[i]]$Fecha2)
}

# Third "/"-separated field of each date string, converted to a number.
year_of <- function(dates) {
  as.numeric(vapply(strsplit(dates, "/"), `[`, character(1), 3))
}

# Rows whose Fecha1 year is later than `year` and whose IE.<year> is NA.
na_before_start <- function(df, year) {
  which(year_of(df$Fecha1) > year & is.na(df[[paste0("IE.", year)]]))
}

# Rows whose Fecha2 year is earlier than `year` and whose IE.<year> is NA.
na_after_end <- function(df, year) {
  which(year_of(df$Fecha2) < year & is.na(df[[paste0("IE.", year)]]))
}

# 2005 is computed for every sector: each result is a list with one element
# per sector, and c() appends the second list after the first.
dontimpute2005 <- lapply(y, na_before_start, year = 2005)
dontimpute2005. <- lapply(y, na_after_end, year = 2005)
di2005 <- c(dontimpute2005, dontimpute2005.)

# NOTE(review): 2006-2010 are computed for a single sector only, y[[i]],
# where `i` is whatever value the loop above left behind (the last sector).
# This is kept as in the original; for the general case the lapply() form
# used for 2005 would apply here as well.
dontimpute2006 <- na_before_start(y[[i]], 2006)
dontimpute2006. <- na_after_end(y[[i]], 2006)
di2006 <- c(dontimpute2006, dontimpute2006.)

dontimpute2007 <- na_before_start(y[[i]], 2007)
dontimpute2007. <- na_after_end(y[[i]], 2007)
di2007 <- c(dontimpute2007, dontimpute2007.)

dontimpute2008 <- na_before_start(y[[i]], 2008)
dontimpute2008. <- na_after_end(y[[i]], 2008)
di2008 <- c(dontimpute2008, dontimpute2008.)

dontimpute2009 <- na_before_start(y[[i]], 2009)
dontimpute2009. <- na_after_end(y[[i]], 2009)
di2009 <- c(dontimpute2009, dontimpute2009.)

dontimpute2010 <- na_before_start(y[[i]], 2010)
dontimpute2010. <- na_after_end(y[[i]], 2010)
di2010 <- c(dontimpute2010, dontimpute2010.)


From this point on I am not able to write the code for the general case, so I 
have done it for one sector only (y[[6]]). Could you help me, please?

datos <- y[[6]]
# STEP 3: IMPUTATION
variables <- names(datos)[6:13]
# For the general case I have tried this:
#   variables <- lapply(datos, function(x) names(x)[6:13])
# (The mail wrap had turned that attempt into live code, which overwrote
# `variables` with a list.)

# Draw columns 6 to 13 of this sector on one plot.
plot(datos[, 6], type = "l", main = "SECTOR")
for (i in 6:13) {
  lines(datos[, i], type = "l", col = i)
}

# If there is any very odd value, remove it.
dat_ <- datos[, variables]
# For the general case I have tried this:
#   lapply(datos, function(x) x[, variables])
# but I get:
#   Error en (x)[, variables] : número incorreto de dimensiones
#   (incorrect number of dimensions)
# NOTE(review): lapply() over a data frame hands each COLUMN (a vector) to
# the function, and a vector cannot be indexed with [, variables]. The list
# of per-sector data frames is `y`, so the general form is presumably
#   lapply(y, function(df) df[, variables])

 library(mice)
# md.pattern() builds a matrix: the first row is the complete cases,
# 0 stands for a missing value and 1 for a known value.
md.pattern(dat_)
# In md.pairs(), r means observed and m means missing.
md.pairs(dat_)

# First run, only to obtain the predictor matrix.
mod1 <- mice(dat_, method = c("", rep("pmm", 7)))
predictor <- mod1$predictorMatrix
# Create the imputations; mod1 is of class mids.
mod1 <- mice(dat_, method = "pmm", predictorMatrix = predictor)

# "long" stacks all the imputed data sets (5 with the mice default).
imputados <- complete(mod1, "long")
x.imp <- split(imputados, imputados$.imp)

# Add up the imputations, dropping the first two columns of the long format.
acumula <- x.imp[[1]][, -c(1, 2)]
for (i in seq_along(x.imp)[-1]) {
  acumula <- acumula + x.imp[[i]][, -c(1, 2)]
}
# Average them. The divisor was a hard-coded 5; using the number of
# imputations gives the same result for the default run.
med.imp <- acumula / length(x.imp)
a <- med.imp

# Plot the original data and the imputed data.
# NOTE(review): the titles look swapped (dat_ is the original data but is
# titled "r10 imp", and a is the imputed average but is titled "r10 orig").
# Columns 2:7 also leave the last variable undrawn. Confirm both.
plot(dat_[, 1], type = "l", main = "r10 imp")
for (i in 2:7) {
  lines(dat_[, i], type = "l", col = i)
}
plot(a[, 1], type = "l", main = "r10 orig")
for (i in 2:7) {
  lines(a[, i], type = "l", col = i)
}

# Summarise the data we have obtained.
summary(a)
summary(dat_)

# Distribution plot of the imputed data.
library(lattice)
com <- complete(mod1, "long", include = TRUE)
col <- rep(
  c("blue", "red")[1 + as.numeric(is.na(mod1$data$C.IE.2005))],
  6
)
# To be checked.
# NOTE(review): the formula uses `C.IE.2005` and `mod1`, while the columns
# seen elsewhere are named `IE.2005` and the imputation index is `.imp` --
# confirm the intended names.
stripplot(
  C.IE.2005 ~ mod1,
  data = imputados, jit = TRUE, fac = 0.8, col = col, pch = 20, cex = 1.4,
  xlab = "Imputation number"
)
# Join the imputed data back to the columns we separated earlier (cast1;
# they have "NUMERO" in common).
NUMERO <- datos[, 1]
u <- cbind(NUMERO, a)
out <- cbind(x[[6]], u) # a bit crude

# Put NA back where there must not be any value.
# NOTE(review): di2005 is built by concatenating two lapply() results, so it
# is a list and cannot be used directly as a subscript here. di2006..di2010
# are computed from y[[i]] rather than from y[[6]]. Confirm that these
# indices really belong to sector 6.
out$IE.2005[di2005] <- NA
out$IE.2006[di2006] <- NA
out$IE.2007[di2007] <- NA
out$IE.2008[di2008] <- NA
out$IE.2009[di2009] <- NA
out$IE.2010[di2010] <- NA




Thanks in advance, and sorry for asking.
Best regards,
Teresa
                                          
        [[alternative HTML version deleted]]

______________________________________________
[email protected] mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

[R] R help

Reply via email to