I found that plyr::daply is more efficient than base::by (am I doing
something wrong?); the updated code for comparison is below (I also fixed a
couple of things).
The daply function from the plyr package also has a .parallel argument, and I
wonder whether creating the timeseries objects in parallel and then combining
them would be faster (Windows XP platform); does anyone have experience with
this topic? I found only very simple examples of plyr and parallel
computation, and I do not have a working example for this kind of
implementation (daply returning a list of timeseries objects).
Thanks in advance,
Daniele Amberti
# Build a reproducible long-format test data set: four series (ID 1..4) with
# N one-second observations each, then shuffle the rows.
set.seed(123)
N <- 10000
X <- data.frame(
  ID = rep(c(1, 2, 3, 4), each = N),
  # Use format() with an explicit pattern so every element keeps its time
  # part. as.character() on POSIXct drops a midnight "00:00:00" in recent R
  # versions (4.3.0 and later), and such a string would no longer match the
  # '%Y-%m-%d %H:%M:%S' format used when the dates are parsed back.
  DATE = rep(
    format(as.POSIXct("2000-01-01", tz = "GMT") + 0:(N - 1),
           "%Y-%m-%d %H:%M:%S", tz = "GMT"),
    4),
  VALUE = runif(N * 4),
  stringsAsFactors = FALSE)
# Random permutation of all rows.
X <- X[sample.int(nrow(X)), ]
str(X)
library(timeSeries)
# Per-group callback: turn the rows of one ID into a timeSeries column and
# append it to the accumulator variable "xx" stored in `env`.
#
# Arguments:
#   x   - data frame holding the rows of a single ID; columns ID, DATE and
#         VALUE are used. DATE is handed to timeSeries() together with the
#         format '%Y-%m-%d %H:%M:%S' and zone 'GMT'.
#   env - environment in which the combined series "xx" is created/extended.
#
# Returns TRUE; the useful result is the side effect on `env`.
buildTimeSeriesFromDataFrame <- function(x, env)
{
  # inherits = FALSE restricts the lookup to `env` itself. With the default
  # (inherits = TRUE) an unrelated "xx" found in an enclosing environment,
  # e.g. the global workspace, would be mistaken for the accumulator and
  # bound into the result.
  if (exists("xx", envir = env, inherits = FALSE)) {
    assign("xx",
           cbind(get("xx", env),
                 timeSeries(x$VALUE, x$DATE,
                            format = '%Y-%m-%d %H:%M:%S',
                            zone = 'GMT', units = as.character(x$ID[1]))),
           envir = env)
  } else {
    # First group: the new series becomes the accumulator.
    assign("xx",
           timeSeries(x$VALUE, x$DATE, format = '%Y-%m-%d %H:%M:%S',
                      zone = 'GMT', units = as.character(x$ID[1])),
           envir = env)
  }
  TRUE
}
# Build the combined timeSeries object using base::by().
# Any arguments passed in are accepted but not used: the function always
# splits the global data frame X by X$ID and lets
# buildTimeSeriesFromDataFrame() accumulate the columns in a fresh
# environment, whose "xx" variable is returned.
tsBy <- function(...)
{
  accumulator <- new.env(parent = baseenv())
  by(X, INDICES = X$ID, FUN = buildTimeSeriesFromDataFrame,
     env = accumulator, simplify = TRUE)
  get("xx", accumulator)
}
# Time 100 runs of the by()-based timeSeries construction, keeping the first
# element of system.time()'s result (user CPU time) for each run.
Time01 <- replicate(100, {
  timing <- system.time(tsBy(X, X$ID, simplify = TRUE))
  timing[[1]]
})
median(Time01)
hist(Time01)
# Keep one result for the comparison with the xts-based version.
ATS <- tsBy(X, X$ID, simplify = TRUE)
library(xts)
# Per-group callback: turn the rows of one ID into an xts column and append
# it to the accumulator variable "xx" stored in `env`.
#
# Arguments:
#   x   - data frame holding the rows of a single ID; columns DATE and VALUE
#         are used. DATE is parsed with as.POSIXct() using tz "GMT" and the
#         format '%Y-%m-%d %H:%M:%S'.
#   env - environment in which the combined series "xx" is created/extended.
#
# Returns TRUE; the useful result is the side effect on `env`.
buildXtsFromDataFrame <- function(x, env)
{
  # inherits = FALSE restricts the lookup to `env` itself. With the default
  # (inherits = TRUE) an unrelated "xx" found in an enclosing environment,
  # e.g. the global workspace, would be mistaken for the accumulator and
  # bound into the result.
  if (exists("xx", envir = env, inherits = FALSE)) {
    assign("xx",
           cbind(get("xx", env),
                 xts(x$VALUE,
                     as.POSIXct(x$DATE, tz = "GMT",
                                format = '%Y-%m-%d %H:%M:%S'),
                     tzone = 'GMT')),
           envir = env)
  } else {
    # First group: the new series becomes the accumulator.
    assign("xx",
           xts(x$VALUE,
               as.POSIXct(x$DATE, tz = "GMT",
                          format = '%Y-%m-%d %H:%M:%S'),
               tzone = 'GMT'),
           envir = env)
  }
  TRUE
}
# Build the combined xts object using base::by().
# Any arguments passed in are accepted but not used: the function always
# splits the global data frame X by X$ID and lets buildXtsFromDataFrame()
# accumulate the columns in a fresh environment, whose "xx" variable is
# returned.
xtsBy <- function(...)
{
  accumulator <- new.env(parent = baseenv())
  by(X, INDICES = X$ID, FUN = buildXtsFromDataFrame,
     env = accumulator, simplify = TRUE)
  get("xx", accumulator)
}
# Time 100 runs of the by()-based xts construction, keeping the first element
# of system.time()'s result (user CPU time) for each run.
Time02 <- replicate(100, {
  timing <- system.time(xtsBy(X, X$ID, simplify = TRUE))
  timing[[1]]
})
median(Time02)
hist(Time02)
# Keep one result for the comparison with the timeSeries-based version.
AXTS <- xtsBy(X, X$ID, simplify = TRUE)
# Overlay both timing densities on a common x range: xts in red,
# timeSeries in blue.
plot(density(Time02), col = "red", xlim = range(Time02, Time01))
lines(density(Time01), col = "blue")
# Check that both approaches give the same result. Names are still a problem,
# so they are copied over from ATS before comparing.
AXTS2 <- as.timeSeries(AXTS)
names(AXTS2) <- names(ATS)
# Compare the data part and the time stamps of the two objects.
identical(getDataPart(ATS), getDataPart(AXTS2))
identical(time(ATS), time(AXTS2))
# with plyr library and daply instead of by:
library(plyr)
# Build the combined timeSeries object using plyr::daply() instead of by().
# Any arguments passed in are accepted but not used: the function always
# splits the global data frame X on its "ID" column and lets
# buildTimeSeriesFromDataFrame() accumulate the columns in a fresh
# environment, whose "xx" variable is returned.
tsDaply <- function(...)
{
  accumulator <- new.env(parent = baseenv())
  daply(X, .variables = "ID", .fun = buildTimeSeriesFromDataFrame,
        env = accumulator)
  get("xx", accumulator)
}
# Time 100 runs of the daply()-based timeSeries construction, keeping the
# first element of system.time()'s result (user CPU time) for each run.
Time03 <- replicate(100, {
  timing <- system.time(tsDaply(X, X$ID))
  timing[[1]]
})
median(Time03)
hist(Time03)
# Build the combined xts object using plyr::daply() instead of by().
# Any arguments passed in are accepted but not used: the function always
# splits the global data frame X on its "ID" column and lets
# buildXtsFromDataFrame() accumulate the columns in a fresh environment,
# whose "xx" variable is returned.
xtsDaply <- function(...)
{
  accumulator <- new.env(parent = baseenv())
  daply(X, .variables = "ID", .fun = buildXtsFromDataFrame,
        env = accumulator)
  get("xx", accumulator)
}
# Time 100 runs of the daply()-based xts construction, keeping the first
# element of system.time()'s result (user CPU time) for each run.
Time04 <- replicate(100, {
  timing <- system.time(xtsDaply(X, X$ID))
  timing[[1]]
})
median(Time04)
hist(Time04)
# Overlay the timing densities of all four variants on a common x range:
# daply/xts in red, daply/timeSeries in blue, the two by() variants in the
# default colour.
plot(density(Time04), col = "red",
     xlim = range(Time02, Time01, Time03, Time04),
     ylim = c(0, 100))
lines(density(Time03), col = "blue")
for (by_timings in list(Time02, Time01)) {
  lines(density(by_timings))
}
-----Original Message-----
From: Daniele Amberti
Sent: 11 March 2011 14:44
To: [email protected]
Subject: dataframe to a timeseries object
I’m wondering which is the most efficient way (in time first, then in memory
usage) to obtain a multivariate time series object from a data frame (the
easiest data structure to get data from a database through RODBC).
I have a starting point using the timeSeries or xts libraries (these libraries
can handle time zones); below you can find code to test.
Parallelizing the merge step (cbind) is something I’m thinking about
(suggestions from users with experience on this topic are highly appreciated);
any suggestion is welcome.
My platform is Windows XP, R 2.12.1, latest available packages on CRAN for
timeSeries and xts.
# Reproducible long-format test data: four series (ID 1..4) with N one-second
# POSIXct time stamps each and uniform random values.
set.seed(123)
N <- 9000
X <- data.frame(
  ID = rep(c(1, 2, 3, 4), each = N),
  DATE = rep(as.POSIXct("2000-01-01", tz = "GMT") + 0:(N - 1), times = 4),
  VALUE = runif(N * 4)
)
library(timeSeries)
# Per-group callback: turn the rows of one ID into a timeSeries column and
# append it to the accumulator variable "xx" stored in `env`.
#
# Arguments:
#   x   - data frame holding the rows of a single ID; columns ID, DATE and
#         VALUE are used. DATE is handed to timeSeries() together with the
#         format '%Y-%m-%d %H:%M:%S' and zone 'GMT'.
#   env - environment in which the combined series "xx" is created/extended.
#
# Returns TRUE; the useful result is the side effect on `env`.
buildTimeSeriesFromDataFrame <- function(x, env)
{
  # inherits = FALSE restricts the lookup to `env` itself. With the default
  # (inherits = TRUE) an unrelated "xx" found in an enclosing environment,
  # e.g. the global workspace, would be mistaken for the accumulator and
  # bound into the result.
  if (exists("xx", envir = env, inherits = FALSE)) {
    # The format string is kept on a single line: a line break inside the
    # quoted literal would put a newline character into the format.
    assign("xx",
           cbind(get("xx", env),
                 timeSeries(x$VALUE, x$DATE,
                            format = '%Y-%m-%d %H:%M:%S',
                            zone = 'GMT', units = as.character(x$ID[1]))),
           envir = env)
  } else {
    # First group: the new series becomes the accumulator.
    assign("xx",
           timeSeries(x$VALUE, x$DATE, format = '%Y-%m-%d %H:%M:%S',
                      zone = 'GMT', units = as.character(x$ID[1])),
           envir = env)
  }
  TRUE
}
# Build the combined timeSeries object using base::by().
# Any arguments passed in are accepted but not used: the function always
# splits the global data frame X by X$ID and lets
# buildTimeSeriesFromDataFrame() accumulate the columns in a fresh
# environment, whose "xx" variable is returned.
fooBy <- function(...)
{
  accumulator <- new.env(parent = baseenv())
  by(X, INDICES = X$ID, FUN = buildTimeSeriesFromDataFrame,
     env = accumulator, simplify = TRUE)
  get("xx", accumulator)
}
# Time 100 runs of the timeSeries construction, keeping the first element of
# system.time()'s result (user CPU time) for each run.
Time01 <- replicate(100, {
  timing <- system.time(
    fooBy(X, X$ID, buildTimeSeriesFromDataFrame, simplify = TRUE)
  )
  timing[[1]]
})
median(Time01)
hist(Time01)
library(xts)
# Per-group callback: turn the rows of one ID into an xts column and append
# it to the accumulator variable "xx" stored in `env`.
#
# Arguments:
#   x   - data frame holding the rows of a single ID; columns DATE and VALUE
#         are used. DATE is passed through as.POSIXct() with tz "GMT" and the
#         format '%Y-%m-%d %H:%M:%S'.
#   env - environment in which the combined series "xx" is created/extended.
#
# Returns TRUE; the useful result is the side effect on `env`.
buildXtsFromDataFrame <- function(x, env)
{
  # inherits = FALSE restricts the lookup to `env` itself. With the default
  # (inherits = TRUE) an unrelated "xx" found in an enclosing environment,
  # e.g. the global workspace, would be mistaken for the accumulator and
  # bound into the result.
  if (exists("xx", envir = env, inherits = FALSE)) {
    # tz = "GMT" is given explicitly so that character dates are not parsed
    # in the session's local time zone.
    assign("xx",
           cbind(get("xx", env),
                 xts(x$VALUE,
                     as.POSIXct(x$DATE, tz = "GMT",
                                format = '%Y-%m-%d %H:%M:%S'),
                     tzone = 'GMT')),
           envir = env)
  } else {
    # First group: the new series becomes the accumulator.
    assign("xx",
           xts(x$VALUE,
               as.POSIXct(x$DATE, tz = "GMT",
                          format = '%Y-%m-%d %H:%M:%S'),
               tzone = 'GMT'),
           envir = env)
  }
  TRUE
}
# Build the combined xts object using base::by() (replaces the earlier
# timeSeries-based definition of the same name).
# Any arguments passed in are accepted but not used: the function always
# splits the global data frame X by X$ID and lets buildXtsFromDataFrame()
# accumulate the columns in a fresh environment, whose "xx" variable is
# returned.
fooBy <- function(...)
{
  accumulator <- new.env(parent = baseenv())
  by(X, INDICES = X$ID, FUN = buildXtsFromDataFrame,
     env = accumulator, simplify = TRUE)
  get("xx", accumulator)
}
# Time 100 runs of the xts construction, keeping the first element of
# system.time()'s result (user CPU time) for each run. The arguments handed
# to fooBy() are not used by it.
Time02 <- replicate(100, {
  timing <- system.time(
    fooBy(X, X$ID, buildTimeSeriesFromDataFrame, simplify = TRUE)
  )
  timing[[1]]
})
median(Time02)
hist(Time02)
# Overlay both timing densities on a common x range.
plot(density(Time02), xlim = range(Time02, Time01))
lines(density(Time01))
Best regards,
Daniele Amberti
ORS Srl
Via Agostino Morando 1/3 12060 Roddi (Cn) - Italy
Tel. +39 0173 620211
Fax. +39 0173 620299 / +39 0173 433111
Web Site www.ors.it
------------------------------------------------------------------------------------------------------------------------
Qualsiasi utilizzo non autorizzato del presente messaggio e dei suoi allegati è
vietato e potrebbe costituire reato.
Se lei avesse ricevuto erroneamente questo messaggio, Le saremmo grati se
provvedesse alla distruzione dello stesso
e degli eventuali allegati.
Opinioni, conclusioni o altre informazioni riportate nella e-mail, che non
siano relative alle attività e/o
alla missione aziendale di O.R.S. Srl si intendono non attribuibili alla
società stessa, né la impegnano in alcun modo.
______________________________________________
[email protected] mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.