Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-24 Thread Simon Urbanek
 % src_length];
 }
 
 int main()
 {
   int nt, ns, kmax, k;
   int *t, *s;
 
   nt = 3;
   ns = 1000;
   kmax = 100;
   t = (int *) malloc(nt * sizeof(int));
   s = (int *) malloc(ns * sizeof(int));
   for (k = 0; k  kmax; k++)
   //copy_ints(s, ns, t, nt);
   copy_ints2(s, ns, t, nt);
   return 0;
 }
 
 Note that the function that actually does the job is
 memcpy_with_recycling_of_src(). It can be reused for copying
 vectors with elements of an arbitrary size.
 
 Cheers,
 H.
 
 Cheers,
 H.
 
 Matthew
 
 William Dunlap wdun...@tibco.com wrote in message 
 news:77eb52c6dd32ba4d87471dcd70c8d70002ce6...@na-pa-vbe03.na.tibco.com...
 If I were worried about the time this loop takes,
 I would avoid using i%nt.  For the attached C code
 compile with gcc 4.3.3 with -O2 I get
   # INTEGER() in loop
   system.time( r1 - .Call(my_rep1, 1:3, 1e7) )
 user  system elapsed
0.060   0.012   0.071
 
   # INTEGER() before loop
   system.time( r2 - .Call(my_rep2, 1:3, 1e7) )
 user  system elapsed
0.076   0.008   0.086
 
   # replace i%src_length in loop with j=0 before loop and
   #if(++j==src_length) j=0 ;
   # in the loop.
   system.time( r3 - .Call(my_rep3, 1:3, 1e7) )
 user  system elapsed
0.024   0.028   0.050
   identical(r1,r2)  identical(r2,r3)
  [1] TRUE
 
 The C code is:
 #define USE_RINTERNALS /* pretend we are in the R kernel */
 #include R.h
 #include Rinternals.h
 
 
 SEXP my_rep1(SEXP s_src, SEXP s_dest_length)
 {
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
for(i=0;idest_length;i++) {
INTEGER(s_dest)[i] = INTEGER(s_src)[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
 }
 SEXP my_rep2(SEXP s_src, SEXP s_dest_length)
 {
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(i=0;idest_length;i++) {
pdest[i] = psrc[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
 }
 SEXP my_rep3(SEXP s_src, SEXP s_dest_length)
 {
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(j=0,i=0;idest_length;i++) {
*pdest++ = psrc[j++] ;
if (j==src_length) {
j = 0 ;
}
}
UNPROTECT(1) ;
return s_dest ;
 }
 
 Bill Dunlap
 Spotfire, TIBCO Software
 wdunlap tibco.com
 
 -Original Message-
 From: r-devel-boun...@r-project.org
 [mailto:r-devel-boun...@r-project.org] On Behalf Of Romain Francois
 Sent: Wednesday, April 21, 2010 12:32 PM
 To: Matthew Dowle
 Cc: r-de...@stat.math.ethz.ch
 Subject: Re: [Rd] suggestion how to use memcpy in duplicate.c
 
 Le 21/04/10 17:54, Matthew Dowle a écrit :
 From copyVector in duplicate.c :
 void copyVector(SEXP s, SEXP t)
 {
 int i, ns, nt;
 nt = LENGTH(t);
 ns = LENGTH(s);
 switch (TYPEOF(s)) {
 ...
 case INTSXP:
 for (i = 0; i  ns; i++)
 INTEGER(s)[i] = INTEGER(t)[i % nt];
 break;
 ...
 
 could that be replaced with :
 
 case INTSXP:
 for (i=0; ins/nt; i++)
 memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char
 *)DATAPTR(t),
 nt*sizeof(int));
 break;
 or at least with something like this:
 
 int* p_s = INTEGER(s) ;
 int* p_t = INTEGER(t) ;
 for( i=0 ; i  ns ; i++){
 p_s[i] = p_t[i % nt];
 }
 
 since expanding the INTEGER macro over and over has a price.
 
 and similar for the other types in copyVector.  This won't
 help regular
 vector copies, since those seem to be done by the
 DUPLICATE_ATOMIC_VECTOR
 macro, see next suggestion below, but it should help
 copyMatrix which calls
 copyVector, scan.c which calls copyVector on three lines,
 dcf.c (once) and
 dounzip.c (once).
 
 For the DUPLICATE_ATOMIC_VECTOR macro there is already a
 comment next to it
 :
 
 FIXME: surely memcpy would be faster here?
 
 which seems to refer to the for loop  :
 
 else { \
 int __i__; \
 type *__fp__ = fun(from), *__tp__ = fun(to); \
 for (__i__ = 0; __i__  __n__; __i__++) \
   __tp__[__i__] = __fp__[__i__]; \
   } \
 
 Could that loop be replaced by the following ?
 
else { \
memcpy((char *)DATAPTR(to), (char *)DATAPTR(from),
 __n__*sizeof(type)); \
}\
 
 In the data.table package, dogroups.c uses this technique,
 so the principle
 is tested and works well so far

Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-23 Thread Hervé Pagès
-project.org
[mailto:r-devel-boun...@r-project.org] On Behalf Of Romain Francois
Sent: Wednesday, April 21, 2010 12:32 PM
To: Matthew Dowle
Cc: r-de...@stat.math.ethz.ch
Subject: Re: [Rd] suggestion how to use memcpy in duplicate.c

Le 21/04/10 17:54, Matthew Dowle a écrit :

From copyVector in duplicate.c :

void copyVector(SEXP s, SEXP t)
{
 int i, ns, nt;
 nt = LENGTH(t);
 ns = LENGTH(s);
 switch (TYPEOF(s)) {
...
 case INTSXP:
 for (i = 0; i  ns; i++)
 INTEGER(s)[i] = INTEGER(t)[i % nt];
 break;
...

could that be replaced with :

 case INTSXP:
 for (i=0; ins/nt; i++)
 memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char

*)DATAPTR(t),

nt*sizeof(int));
 break;

or at least with something like this:

int* p_s = INTEGER(s) ;
int* p_t = INTEGER(t) ;
for( i=0 ; i  ns ; i++){
p_s[i] = p_t[i % nt];
}

since expanding the INTEGER macro over and over has a price.


and similar for the other types in copyVector.  This won't

help regular

vector copies, since those seem to be done by the

DUPLICATE_ATOMIC_VECTOR

macro, see next suggestion below, but it should help

copyMatrix which calls

copyVector, scan.c which calls copyVector on three lines,

dcf.c (once) and

dounzip.c (once).

For the DUPLICATE_ATOMIC_VECTOR macro there is already a

comment next to it

:

 FIXME: surely memcpy would be faster here?

which seems to refer to the for loop  :

 else { \
 int __i__; \
 type *__fp__ = fun(from), *__tp__ = fun(to); \
 for (__i__ = 0; __i__  __n__; __i__++) \
   __tp__[__i__] = __fp__[__i__]; \
   } \

Could that loop be replaced by the following ?

else { \
memcpy((char *)DATAPTR(to), (char *)DATAPTR(from),

__n__*sizeof(type)); \

}\

In the data.table package, dogroups.c uses this technique,

so the principle

is tested and works well so far.

Are there any road blocks preventing this change, or is

anyone already

working on it ?  If not then I'll try and test it (on

Ubuntu 32bit) and

submit patch with timings, as before.  Comments/pointers

much appreciated.

Matthew

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel




--
Romain Francois
Professional R Enthusiast
+33(0) 6 28 91 30 30
http://romainfrancois.blog.free.fr
|- http://bit.ly/9aKDM9 : embed images in Rd documents
|- http://tr.im/OIXN : raster images and RImageJ
|- http://tr.im/OcQe : Rcpp 0.7.7

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel



__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


--
Hervé Pagès

Program in Computational Biology
Division of Public Health Sciences
Fred Hutchinson Cancer Research Center
1100 Fairview Ave. N, M2-B876
P.O. Box 19024
Seattle, WA 98109-1024

E-mail: hpa...@fhcrc.org
Phone:  (206) 667-5791
Fax:(206) 667-1319

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-23 Thread Hervé Pagès
 with gcc 4.3.3 with -O2 I get
   # INTEGER() in loop
   system.time( r1 - .Call(my_rep1, 1:3, 1e7) )
 user  system elapsed
0.060   0.012   0.071

   # INTEGER() before loop
   system.time( r2 - .Call(my_rep2, 1:3, 1e7) )
 user  system elapsed
0.076   0.008   0.086

   # replace i%src_length in loop with j=0 before loop and
   #if(++j==src_length) j=0 ;
   # in the loop.
   system.time( r3 - .Call(my_rep3, 1:3, 1e7) )
 user  system elapsed
0.024   0.028   0.050
   identical(r1,r2)  identical(r2,r3)
  [1] TRUE

The C code is:
#define USE_RINTERNALS /* pretend we are in the R kernel */
#include R.h
#include Rinternals.h


SEXP my_rep1(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
for(i=0;idest_length;i++) {
INTEGER(s_dest)[i] = INTEGER(s_src)[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
}
SEXP my_rep2(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(i=0;idest_length;i++) {
pdest[i] = psrc[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
}
SEXP my_rep3(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(j=0,i=0;idest_length;i++) {
*pdest++ = psrc[j++] ;
if (j==src_length) {
j = 0 ;
}
}
UNPROTECT(1) ;
return s_dest ;
}

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com


-Original Message-
From: r-devel-boun...@r-project.org
[mailto:r-devel-boun...@r-project.org] On Behalf Of Romain Francois
Sent: Wednesday, April 21, 2010 12:32 PM
To: Matthew Dowle
Cc: r-de...@stat.math.ethz.ch
Subject: Re: [Rd] suggestion how to use memcpy in duplicate.c

Le 21/04/10 17:54, Matthew Dowle a écrit :

From copyVector in duplicate.c :

void copyVector(SEXP s, SEXP t)
{
 int i, ns, nt;
 nt = LENGTH(t);
 ns = LENGTH(s);
 switch (TYPEOF(s)) {
...
 case INTSXP:
 for (i = 0; i  ns; i++)
 INTEGER(s)[i] = INTEGER(t)[i % nt];
 break;
...

could that be replaced with :

 case INTSXP:
 for (i=0; ins/nt; i++)
 memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char

*)DATAPTR(t),

nt*sizeof(int));
 break;

or at least with something like this:

int* p_s = INTEGER(s) ;
int* p_t = INTEGER(t) ;
for( i=0 ; i  ns ; i++){
p_s[i] = p_t[i % nt];
}

since expanding the INTEGER macro over and over has a price.


and similar for the other types in copyVector.  This won't

help regular

vector copies, since those seem to be done by the

DUPLICATE_ATOMIC_VECTOR

macro, see next suggestion below, but it should help

copyMatrix which calls

copyVector, scan.c which calls copyVector on three lines,

dcf.c (once) and

dounzip.c (once).

For the DUPLICATE_ATOMIC_VECTOR macro there is already a

comment next to it

:

 FIXME: surely memcpy would be faster here?

which seems to refer to the for loop  :

 else { \
 int __i__; \
 type *__fp__ = fun(from), *__tp__ = fun(to); \
 for (__i__ = 0; __i__  __n__; __i__++) \
   __tp__[__i__] = __fp__[__i__]; \
   } \

Could that loop be replaced by the following ?

else { \
memcpy((char *)DATAPTR(to), (char *)DATAPTR(from),

__n__*sizeof(type)); \

}\

In the data.table package, dogroups.c uses this technique,

so the principle

is tested and works well so far.

Are there any road blocks preventing this change, or is

anyone already

working on it ?  If not then I'll try and test it (on

Ubuntu 32bit) and

submit patch with timings, as before.  Comments/pointers

much appreciated.

Matthew

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel




--
Romain Francois
Professional R Enthusiast
+33(0) 6 28 91 30 30
http://romainfrancois.blog.free.fr
|- http://bit.ly/9aKDM9 : embed images in Rd documents
|- http://tr.im/OIXN : raster images and RImageJ
|- http://tr.im/OcQe : Rcpp 0.7.7

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel



__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r

Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-23 Thread Hervé Pagès

Hervé Pagès wrote:
[...]

Code:
===
#include stdio.h
#include string.h
#include stdlib.h

void memcpy_with_recycling_of_src(char *dest, size_t dest_nblocks,
  const char *src, size_t src_nblocks,
  size_t blocksize)
{
int i, imax, q;
size_t src_size;

imax = dest_nblocks - src_nblocks;
src_size = src_nblocks * blocksize;
for (i = 0; i = imax; i += src_nblocks) {
memcpy(dest, src, src_size);
dest += src_size;
i += src_nblocks;

  ^
  //i += src_nblocks;

oops, take this out!

Of course copy_ints is twice slower now but is still
about 2.5x faster than copy_ints2 (the copyVector
way) for copying a length 3 vector recycled 3.3 million
times. For a length 1000 vector being recycled 25 times,
it's about 17x faster.

Cheers,
H.


}
q = dest_nblocks - i;
if (q  0)
memcpy(dest, src, q * blocksize);
return;
}

void copy_ints(int *dest, int dest_length,
   const int *src, int src_length)
{
memcpy_with_recycling_of_src((char *) dest, dest_length,
 (char *) src, src_length,
 sizeof(int));
}

/* the copyVector() way */
void copy_ints2(int *dest, int dest_length,
const int *src, int src_length)
{
int i;

for (i = 0; i  dest_length; i++)
dest[i] = src[i % src_length];
}

int main()
{
int nt, ns, kmax, k;
int *t, *s;

nt = 3;
ns = 1000;
kmax = 100;
t = (int *) malloc(nt * sizeof(int));
s = (int *) malloc(ns * sizeof(int));
for (k = 0; k  kmax; k++)
//copy_ints(s, ns, t, nt);
copy_ints2(s, ns, t, nt);
return 0;
}

Note that the function that actually does the job is
memcpy_with_recycling_of_src(). It can be reused for copying
vectors with elements of an arbitrary size.

Cheers,
H.



Cheers,
H.



Matthew

William Dunlap wdun...@tibco.com wrote in message 
news:77eb52c6dd32ba4d87471dcd70c8d70002ce6...@na-pa-vbe03.na.tibco.com... 


If I were worried about the time this loop takes,
I would avoid using i%nt.  For the attached C code
compile with gcc 4.3.3 with -O2 I get
   # INTEGER() in loop
   system.time( r1 - .Call(my_rep1, 1:3, 1e7) )
 user  system elapsed
0.060   0.012   0.071

   # INTEGER() before loop
   system.time( r2 - .Call(my_rep2, 1:3, 1e7) )
 user  system elapsed
0.076   0.008   0.086

   # replace i%src_length in loop with j=0 before loop and
   #if(++j==src_length) j=0 ;
   # in the loop.
   system.time( r3 - .Call(my_rep3, 1:3, 1e7) )
 user  system elapsed
0.024   0.028   0.050
   identical(r1,r2)  identical(r2,r3)
  [1] TRUE

The C code is:
#define USE_RINTERNALS /* pretend we are in the R kernel */
#include R.h
#include Rinternals.h


SEXP my_rep1(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
for(i=0;idest_length;i++) {
INTEGER(s_dest)[i] = INTEGER(s_src)[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
}
SEXP my_rep2(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(i=0;idest_length;i++) {
pdest[i] = psrc[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
}
SEXP my_rep3(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(j=0,i=0;idest_length;i++) {
*pdest++ = psrc[j++] ;
if (j==src_length) {
j = 0 ;
}
}
UNPROTECT(1) ;
return s_dest ;
}

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com


-Original Message-
From: r-devel-boun...@r-project.org
[mailto:r-devel-boun...@r-project.org] On Behalf Of Romain Francois
Sent: Wednesday, April 21, 2010 12:32 PM
To: Matthew Dowle
Cc: r-de...@stat.math.ethz.ch
Subject: Re: [Rd] suggestion how to use memcpy in duplicate.c

Le 21/04/10 17:54, Matthew Dowle a écrit :

From copyVector in duplicate.c :

void copyVector(SEXP s, SEXP t)
{
 int i, ns, nt;
 nt = LENGTH(t);
 ns = LENGTH(s);
 switch (TYPEOF(s)) {
...
 case INTSXP:
 for (i = 0; i  ns; i++)
 INTEGER(s)[i] = INTEGER(t)[i

Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-22 Thread Matthew Dowle

Is this a thumbs up for memcpy for DUPLICATE_ATOMIC_VECTOR at least ?

If there is further specific testing then let me know, happy to help, but 
you seem to have beaten me to it.

Matthew


Simon Urbanek simon.urba...@r-project.org wrote in message 
news:65d21b93-a737-4a94-bdf4-ad7e90518...@r-project.org...

 On Apr 21, 2010, at 2:15 PM, Seth Falcon wrote:

 On 4/21/10 10:45 AM, Simon Urbanek wrote:
 Won't that miss the last incomplete chunk? (and please don't use
 DATAPTR on INTSXP even though the effect is currently the same)

 In general it seems that the it depends on nt whether this is
 efficient or not since calls to short memcpy are expensive (very
 small nt that is).

 I ran some empirical tests to compare memcpy vs for() (x86_64, OS X)
 and the results were encouraging - depending on the size of the
 copied block the difference could be quite big: tiny block (ca. n =
 32 or less) - for() is faster small block (n ~ 1k) - memcpy is ca. 8x
 faster as the size increases the gap closes (presumably due to RAM
 bandwidth limitations) so for n = 512M it is ~30%.


 Of course this is contingent on the implementation of memcpy,
 compiler, architecture etc. And will only matter if copying is what
 you do most of the time ...

 Copying of vectors is something that I would expect to happen fairly 
 often in many applications of R.

 Is for() faster on small blocks by enough that one would want to branch 
 based on size?


 Good question. Given that the branching itself adds overhead possibly not. 
 In the best case for() can be ~40% faster (for single-digit n) but that 
 means billions of copies to make a difference (since the operation itself 
 is so fast). The break-even point on my test machine is n=32 and when I 
 added the branching it took 20% hit so I guess it's simply not worth it. 
 The only case that may be worth branching is n:1 since that is likely a 
 fairly common use (the branching penalty in copy routines is lower than 
 comparing memcpy/for implementations since the branching can be done 
 before the outer for loop so this may vary case-by-case).

 Cheers,
 Simon


__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-22 Thread Matthew Dowle

Just to add some clarification, the suggestion wasn't motivated by speeding 
up a length 3 vector being recycled 3.3 million times.  But its a good point 
that any change should not make that case slower.  I don't know how much 
vectorCopy is called really,  DUPLICATE_ATOMIC_VECTOR seems more 
significant, which doesn't recycle, and already had the FIXME next to it.

Where copyVector is passed a large source though, then memcpy should be 
faster than any of the methods using a for loop through each element 
(whether recycling or not),  allowing for the usual caveats. What are the 
timings like if you repeat the for loop 100 times to get a more robust 
timing ?  It needs to be a repeat around the for loop only, not the 
allocVector whose variance looks to be included in those timings below. Then 
increase the size of the source vector,  and compare to memcpy.

Matthew

William Dunlap wdun...@tibco.com wrote in message 
news:77eb52c6dd32ba4d87471dcd70c8d70002ce6...@na-pa-vbe03.na.tibco.com...
If I were worried about the time this loop takes,
I would avoid using i%nt.  For the attached C code
compile with gcc 4.3.3 with -O2 I get
   # INTEGER() in loop
   system.time( r1 - .Call(my_rep1, 1:3, 1e7) )
 user  system elapsed
0.060   0.012   0.071

   # INTEGER() before loop
   system.time( r2 - .Call(my_rep2, 1:3, 1e7) )
 user  system elapsed
0.076   0.008   0.086

   # replace i%src_length in loop with j=0 before loop and
   #if(++j==src_length) j=0 ;
   # in the loop.
   system.time( r3 - .Call(my_rep3, 1:3, 1e7) )
 user  system elapsed
0.024   0.028   0.050
   identical(r1,r2)  identical(r2,r3)
  [1] TRUE

The C code is:
#define USE_RINTERNALS /* pretend we are in the R kernel */
#include R.h
#include Rinternals.h


SEXP my_rep1(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
for(i=0;idest_length;i++) {
INTEGER(s_dest)[i] = INTEGER(s_src)[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
}
SEXP my_rep2(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(i=0;idest_length;i++) {
pdest[i] = psrc[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
}
SEXP my_rep3(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(j=0,i=0;idest_length;i++) {
*pdest++ = psrc[j++] ;
if (j==src_length) {
j = 0 ;
}
}
UNPROTECT(1) ;
return s_dest ;
}

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com

 -Original Message-
 From: r-devel-boun...@r-project.org
 [mailto:r-devel-boun...@r-project.org] On Behalf Of Romain Francois
 Sent: Wednesday, April 21, 2010 12:32 PM
 To: Matthew Dowle
 Cc: r-de...@stat.math.ethz.ch
 Subject: Re: [Rd] suggestion how to use memcpy in duplicate.c

 Le 21/04/10 17:54, Matthew Dowle a écrit :
 
  From copyVector in duplicate.c :
 
  void copyVector(SEXP s, SEXP t)
  {
   int i, ns, nt;
   nt = LENGTH(t);
   ns = LENGTH(s);
   switch (TYPEOF(s)) {
  ...
   case INTSXP:
   for (i = 0; i  ns; i++)
   INTEGER(s)[i] = INTEGER(t)[i % nt];
   break;
  ...
 
  could that be replaced with :
 
   case INTSXP:
   for (i=0; ins/nt; i++)
   memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char
 *)DATAPTR(t),
  nt*sizeof(int));
   break;

 or at least with something like this:

 int* p_s = INTEGER(s) ;
 int* p_t = INTEGER(t) ;
 for( i=0 ; i  ns ; i++){
 p_s[i] = p_t[i % nt];
 }

 since expanding the INTEGER macro over and over has a price.

  and similar for the other types in copyVector.  This won't
 help regular
  vector copies, since those seem to be done by the
 DUPLICATE_ATOMIC_VECTOR
  macro, see next suggestion below, but it should help
 copyMatrix which calls
  copyVector, scan.c which calls copyVector on three lines,
 dcf.c (once) and
  dounzip.c (once).
 
  For the DUPLICATE_ATOMIC_VECTOR macro there is already a
 comment next to it
  :
 
   FIXME: surely memcpy would be faster here?
 
  which seems to refer to the for loop  :
 
   else { \
   int __i__; \
   type *__fp__ = fun(from), *__tp__ = fun(to); \
   for (__i__

Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-22 Thread Simon Urbanek

On Apr 22, 2010, at 7:12 AM, Matthew Dowle wrote:

 
 Is this a thumbs up for memcpy for DUPLICATE_ATOMIC_VECTOR at least ?
 
 If there is further specific testing then let me know, happy to help, but 
 you seem to have beaten me to it.
 

I was not volunteering to do anything - I was just looking at whether it makes 
sense to bother at all and pointing out the bugs in your code ;). I have a 
sufficiently long list of TODOs already :P

Cheers,
Simon


 
 Simon Urbanek simon.urba...@r-project.org wrote in message 
 news:65d21b93-a737-4a94-bdf4-ad7e90518...@r-project.org...
 
 On Apr 21, 2010, at 2:15 PM, Seth Falcon wrote:
 
 On 4/21/10 10:45 AM, Simon Urbanek wrote:
 Won't that miss the last incomplete chunk? (and please don't use
 DATAPTR on INTSXP even though the effect is currently the same)
 
 In general it seems that the it depends on nt whether this is
 efficient or not since calls to short memcpy are expensive (very
 small nt that is).
 
 I ran some empirical tests to compare memcpy vs for() (x86_64, OS X)
 and the results were encouraging - depending on the size of the
 copied block the difference could be quite big: tiny block (ca. n =
 32 or less) - for() is faster small block (n ~ 1k) - memcpy is ca. 8x
 faster as the size increases the gap closes (presumably due to RAM
 bandwidth limitations) so for n = 512M it is ~30%.
 
 
 Of course this is contingent on the implementation of memcpy,
 compiler, architecture etc. And will only matter if copying is what
 you do most of the time ...
 
 Copying of vectors is something that I would expect to happen fairly 
 often in many applications of R.
 
 Is for() faster on small blocks by enough that one would want to branch 
 based on size?
 
 
 Good question. Given that the branching itself adds overhead possibly not. 
 In the best case for() can be ~40% faster (for single-digit n) but that 
 means billions of copies to make a difference (since the operation itself 
 is so fast). The break-even point on my test machine is n=32 and when I 
 added the branching it took 20% hit so I guess it's simply not worth it. 
 The only case that may be worth branching is n:1 since that is likely a 
 fairly common use (the branching penalty in copy routines is lower than 
 comparing memcpy/for implementations since the branching can be done 
 before the outer for loop so this may vary case-by-case).
 
 Cheers,
 Simon
 
 
 __
 R-devel@r-project.org mailing list
 https://stat.ethz.ch/mailman/listinfo/r-devel
 
 

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


[Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Matthew Dowle
From copyVector in duplicate.c :

void copyVector(SEXP s, SEXP t)
{
int i, ns, nt;
nt = LENGTH(t);
ns = LENGTH(s);
switch (TYPEOF(s)) {
...
case INTSXP:
for (i = 0; i  ns; i++)
INTEGER(s)[i] = INTEGER(t)[i % nt];
break;
...

could that be replaced with :

case INTSXP:
for (i=0; ins/nt; i++)
memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char *)DATAPTR(t), 
nt*sizeof(int));
break;

and similar for the other types in copyVector.  This won't help regular 
vector copies, since those seem to be done by the DUPLICATE_ATOMIC_VECTOR 
macro, see next suggestion below, but it should help copyMatrix which calls 
copyVector, scan.c which calls copyVector on three lines, dcf.c (once) and 
dounzip.c (once).

For the DUPLICATE_ATOMIC_VECTOR macro there is already a comment next to it 
:

FIXME: surely memcpy would be faster here?

which seems to refer to the for loop  :

else { \
int __i__; \
type *__fp__ = fun(from), *__tp__ = fun(to); \
for (__i__ = 0; __i__  __n__; __i__++) \
  __tp__[__i__] = __fp__[__i__]; \
  } \

Could that loop be replaced by the following ?

   else { \
   memcpy((char *)DATAPTR(to), (char *)DATAPTR(from), __n__*sizeof(type)); \
   }\

In the data.table package, dogroups.c uses this technique, so the principle 
is tested and works well so far.

Are there any road blocks preventing this change, or is anyone already 
working on it ?  If not then I'll try and test it (on Ubuntu 32bit) and 
submit patch with timings, as before.  Comments/pointers much appreciated.

Matthew

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Simon Urbanek
Matt,

On Apr 21, 2010, at 11:54 AM, Matthew Dowle wrote:

 From copyVector in duplicate.c :
 
 void copyVector(SEXP s, SEXP t)
 {
int i, ns, nt;
nt = LENGTH(t);
ns = LENGTH(s);
switch (TYPEOF(s)) {
 ...
case INTSXP:
for (i = 0; i  ns; i++)
INTEGER(s)[i] = INTEGER(t)[i % nt];
break;
 ...
 
 could that be replaced with :
 
case INTSXP:
for (i=0; ins/nt; i++)
memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char *)DATAPTR(t), 
 nt*sizeof(int));
break;
 

Won't that miss the last incomplete chunk? (and please don't use DATAPTR on 
INTSXP even though the effect is currently the same)

In general it seems that the it depends on nt whether this is efficient or not 
since calls to short memcpy are expensive (very small nt that is).

 I ran some empirical tests to compare memcpy vs for() (x86_64, OS X) and the 
results were encouraging - depending on the size of the copied block the 
difference could be quite big:
tiny block (ca. n = 32 or less) - for() is faster
small block (n ~ 1k) - memcpy is ca. 8x faster
as the size increases the gap closes (presumably due to RAM bandwidth 
limitations) so for n = 512M it is ~30%.

Of course this is contingent on the implementation of memcpy, compiler, 
architecture etc. And will only matter if copying is what you do most of the 
time ...

Cheers,
Simon



 and similar for the other types in copyVector.  This won't help regular 
 vector copies, since those seem to be done by the DUPLICATE_ATOMIC_VECTOR 
 macro, see next suggestion below, but it should help copyMatrix which calls 
 copyVector, scan.c which calls copyVector on three lines, dcf.c (once) and 
 dounzip.c (once).
 
 For the DUPLICATE_ATOMIC_VECTOR macro there is already a comment next to it 
 :
 
FIXME: surely memcpy would be faster here?
 
 which seems to refer to the for loop  :
 
else { \
int __i__; \
type *__fp__ = fun(from), *__tp__ = fun(to); \
for (__i__ = 0; __i__  __n__; __i__++) \
  __tp__[__i__] = __fp__[__i__]; \
  } \
 
 Could that loop be replaced by the following ?
 
   else { \
   memcpy((char *)DATAPTR(to), (char *)DATAPTR(from), __n__*sizeof(type)); \
   }\
 
 In the data.table package, dogroups.c uses this technique, so the principle 
 is tested and works well so far.
 
 Are there any road blocks preventing this change, or is anyone already 
 working on it ?  If not then I'll try and test it (on Ubuntu 32bit) and 
 submit patch with timings, as before.  Comments/pointers much appreciated.
 
 Matthew
 
 __
 R-devel@r-project.org mailing list
 https://stat.ethz.ch/mailman/listinfo/r-devel
 
 

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Seth Falcon

On 4/21/10 10:45 AM, Simon Urbanek wrote:

Won't that miss the last incomplete chunk? (and please don't use
DATAPTR on INTSXP even though the effect is currently the same)

In general it seems that the it depends on nt whether this is
efficient or not since calls to short memcpy are expensive (very
small nt that is).

I ran some empirical tests to compare memcpy vs for() (x86_64, OS X)
and the results were encouraging - depending on the size of the
copied block the difference could be quite big: tiny block (ca. n =
32 or less) - for() is faster small block (n ~ 1k) - memcpy is ca. 8x
faster as the size increases the gap closes (presumably due to RAM
bandwidth limitations) so for n = 512M it is ~30%.




Of course this is contingent on the implementation of memcpy,
compiler, architecture etc. And will only matter if copying is what
you do most of the time ...


Copying of vectors is something that I would expect to happen fairly 
often in many applications of R.


Is for() faster on small blocks by enough that one would want to branch 
based on size?


+ seth

--
Seth Falcon | @sfalcon | http://userprimary.net/

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Romain Francois

Le 21/04/10 17:54, Matthew Dowle a écrit :



From copyVector in duplicate.c :


void copyVector(SEXP s, SEXP t)
{
 int i, ns, nt;
 nt = LENGTH(t);
 ns = LENGTH(s);
 switch (TYPEOF(s)) {
...
 case INTSXP:
 for (i = 0; i  ns; i++)
 INTEGER(s)[i] = INTEGER(t)[i % nt];
 break;
...

could that be replaced with :

 case INTSXP:
 for (i=0; ins/nt; i++)
 memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char *)DATAPTR(t),
nt*sizeof(int));
 break;


or at least with something like this:

int* p_s = INTEGER(s) ;
int* p_t = INTEGER(t) ;
for( i=0 ; i  ns ; i++){
p_s[i] = p_t[i % nt];
}

since expanding the INTEGER macro over and over has a price.


and similar for the other types in copyVector.  This won't help regular
vector copies, since those seem to be done by the DUPLICATE_ATOMIC_VECTOR
macro, see next suggestion below, but it should help copyMatrix which calls
copyVector, scan.c which calls copyVector on three lines, dcf.c (once) and
dounzip.c (once).

For the DUPLICATE_ATOMIC_VECTOR macro there is already a comment next to it
:

 FIXME: surely memcpy would be faster here?

which seems to refer to the for loop  :

 else { \
 int __i__; \
 type *__fp__ = fun(from), *__tp__ = fun(to); \
 for (__i__ = 0; __i__  __n__; __i__++) \
   __tp__[__i__] = __fp__[__i__]; \
   } \

Could that loop be replaced by the following ?

else { \
memcpy((char *)DATAPTR(to), (char *)DATAPTR(from), __n__*sizeof(type)); \
}\

In the data.table package, dogroups.c uses this technique, so the principle
is tested and works well so far.

Are there any road blocks preventing this change, or is anyone already
working on it ?  If not then I'll try and test it (on Ubuntu 32bit) and
submit patch with timings, as before.  Comments/pointers much appreciated.

Matthew

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel





--
Romain Francois
Professional R Enthusiast
+33(0) 6 28 91 30 30
http://romainfrancois.blog.free.fr
|- http://bit.ly/9aKDM9 : embed images in Rd documents
|- http://tr.im/OIXN : raster images and RImageJ
|- http://tr.im/OcQe : Rcpp 0.7.7

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Simon Urbanek

On Apr 21, 2010, at 2:15 PM, Seth Falcon wrote:

 On 4/21/10 10:45 AM, Simon Urbanek wrote:
 Won't that miss the last incomplete chunk? (and please don't use
 DATAPTR on INTSXP even though the effect is currently the same)
 
 In general it seems that the it depends on nt whether this is
 efficient or not since calls to short memcpy are expensive (very
 small nt that is).
 
 I ran some empirical tests to compare memcpy vs for() (x86_64, OS X)
 and the results were encouraging - depending on the size of the
 copied block the difference could be quite big: tiny block (ca. n =
 32 or less) - for() is faster small block (n ~ 1k) - memcpy is ca. 8x
 faster as the size increases the gap closes (presumably due to RAM
 bandwidth limitations) so for n = 512M it is ~30%.
 
 
 Of course this is contingent on the implementation of memcpy,
 compiler, architecture etc. And will only matter if copying is what
 you do most of the time ...
 
 Copying of vectors is something that I would expect to happen fairly often in 
 many applications of R.
 
 Is for() faster on small blocks by enough that one would want to branch based 
 on size?
 

Good question. Given that the branching itself adds overhead possibly not. In 
the best case for() can be ~40% faster (for single-digit n) but that means 
billions of copies to make a difference (since the operation itself is so 
fast). The break-even point on my test machine is n=32 and when I added the 
branching it took 20% hit so I guess it's simply not worth it. The only case 
that may be worth branching is n:1 since that is likely a fairly common use 
(the branching penalty in copy routines is lower than comparing memcpy/for 
implementations since the branching can be done before the outer for loop so 
this may vary case-by-case).

Cheers,
Simon

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Simon Urbanek

On Apr 21, 2010, at 3:32 PM, Romain Francois wrote:

 Le 21/04/10 17:54, Matthew Dowle a écrit :
 
 From copyVector in duplicate.c :
 
 void copyVector(SEXP s, SEXP t)
 {
 int i, ns, nt;
 nt = LENGTH(t);
 ns = LENGTH(s);
 switch (TYPEOF(s)) {
 ...
 case INTSXP:
 for (i = 0; i  ns; i++)
 INTEGER(s)[i] = INTEGER(t)[i % nt];
 break;
 ...
 
 could that be replaced with :
 
 case INTSXP:
 for (i=0; ins/nt; i++)
 memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char *)DATAPTR(t),
 nt*sizeof(int));
 break;
 
 or at least with something like this:
 
 int* p_s = INTEGER(s) ;
 int* p_t = INTEGER(t) ;
 for( i=0 ; i  ns ; i++){
   p_s[i] = p_t[i % nt];
 }
 
 since expanding the INTEGER macro over and over has a price.
 

... in packages, yes, but not in core R.

Cheers,
Simon


 and similar for the other types in copyVector.  This won't help regular
 vector copies, since those seem to be done by the DUPLICATE_ATOMIC_VECTOR
 macro, see next suggestion below, but it should help copyMatrix which calls
 copyVector, scan.c which calls copyVector on three lines, dcf.c (once) and
 dounzip.c (once).
 
 For the DUPLICATE_ATOMIC_VECTOR macro there is already a comment next to it
 :
 
 FIXME: surely memcpy would be faster here?
 
 which seems to refer to the for loop  :
 
 else { \
 int __i__; \
 type *__fp__ = fun(from), *__tp__ = fun(to); \
 for (__i__ = 0; __i__  __n__; __i__++) \
   __tp__[__i__] = __fp__[__i__]; \
   } \
 
 Could that loop be replaced by the following ?
 
else { \
memcpy((char *)DATAPTR(to), (char *)DATAPTR(from), __n__*sizeof(type)); \
}\
 
 In the data.table package, dogroups.c uses this technique, so the principle
 is tested and works well so far.
 
 Are there any road blocks preventing this change, or is anyone already
 working on it ?  If not then I'll try and test it (on Ubuntu 32bit) and
 submit patch with timings, as before.  Comments/pointers much appreciated.
 
 Matthew
 
 __
 R-devel@r-project.org mailing list
 https://stat.ethz.ch/mailman/listinfo/r-devel
 
 
 
 
 -- 
 Romain Francois
 Professional R Enthusiast
 +33(0) 6 28 91 30 30
 http://romainfrancois.blog.free.fr
 |- http://bit.ly/9aKDM9 : embed images in Rd documents
 |- http://tr.im/OIXN : raster images and RImageJ
 |- http://tr.im/OcQe : Rcpp 0.7.7
 
 __
 R-devel@r-project.org mailing list
 https://stat.ethz.ch/mailman/listinfo/r-devel
 
 

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Romain Francois

Le 21/04/10 21:39, Simon Urbanek a écrit :



On Apr 21, 2010, at 3:32 PM, Romain Francois wrote:


Le 21/04/10 17:54, Matthew Dowle a écrit :



 From copyVector in duplicate.c :


void copyVector(SEXP s, SEXP t)
{
 int i, ns, nt;
 nt = LENGTH(t);
 ns = LENGTH(s);
 switch (TYPEOF(s)) {
...
 case INTSXP:
 for (i = 0; i   ns; i++)
 INTEGER(s)[i] = INTEGER(t)[i % nt];
 break;
...

could that be replaced with :

 case INTSXP:
 for (i=0; ins/nt; i++)
 memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char *)DATAPTR(t),
nt*sizeof(int));
 break;


or at least with something like this:

int* p_s = INTEGER(s) ;
int* p_t = INTEGER(t) ;
for( i=0 ; i  ns ; i++){
p_s[i] = p_t[i % nt];
}

since expanding the INTEGER macro over and over has a price.



... in packages, yes, but not in core R.


Hmm. I was not talking about the overhead of the INTEGER function:

int *(INTEGER)(SEXP x) {
if(TYPEOF(x) != INTSXP  TYPEOF(x) != LGLSXP)
error(%s() can only be applied to a '%s', not a '%s',
  INTEGER, integer, type2char(TYPEOF(x)));
return INTEGER(x);
}



but the one related to the macro.

#define INTEGER(x)  ((int *) DATAPTR(x))
#define DATAPTR(x)  (((SEXPREC_ALIGN *) (x)) + 1)

so the loop expands to :

for (i = 0; i   ns; i++)
  ((int *) (((SEXPREC_ALIGN *) (s)) + 1))[i] = ((int *) 
(((SEXPREC_ALIGN *) (t)) + 1))[i % nt];


I still believe grabbing the pointer just once for s and once for t is 
more efficient ...



Cheers,
Simon



and similar for the other types in copyVector.  This won't help regular
vector copies, since those seem to be done by the DUPLICATE_ATOMIC_VECTOR
macro, see next suggestion below, but it should help copyMatrix which calls
copyVector, scan.c which calls copyVector on three lines, dcf.c (once) and
dounzip.c (once).

For the DUPLICATE_ATOMIC_VECTOR macro there is already a comment next to it
:

 FIXME: surely memcpy would be faster here?

which seems to refer to the for loop  :

 else { \
 int __i__; \
 type *__fp__ = fun(from), *__tp__ = fun(to); \
 for (__i__ = 0; __i__   __n__; __i__++) \
   __tp__[__i__] = __fp__[__i__]; \
   } \

Could that loop be replaced by the following ?

else { \
memcpy((char *)DATAPTR(to), (char *)DATAPTR(from), __n__*sizeof(type)); \
}\

In the data.table package, dogroups.c uses this technique, so the principle
is tested and works well so far.

Are there any road blocks preventing this change, or is anyone already
working on it ?  If not then I'll try and test it (on Ubuntu 32bit) and
submit patch with timings, as before.  Comments/pointers much appreciated.

Matthew



--
Romain Francois
Professional R Enthusiast
+33(0) 6 28 91 30 30
http://romainfrancois.blog.free.fr
|- http://bit.ly/9aKDM9 : embed images in Rd documents
|- http://tr.im/OIXN : raster images and RImageJ
|- http://tr.im/OcQe : Rcpp 0.7.7

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Simon Urbanek

On Apr 21, 2010, at 4:13 PM, Romain Francois wrote:

 Le 21/04/10 21:39, Simon Urbanek a écrit :
 
 
 On Apr 21, 2010, at 3:32 PM, Romain Francois wrote:
 
 Le 21/04/10 17:54, Matthew Dowle a écrit :
 
 From copyVector in duplicate.c :
 
 void copyVector(SEXP s, SEXP t)
 {
 int i, ns, nt;
 nt = LENGTH(t);
 ns = LENGTH(s);
 switch (TYPEOF(s)) {
 ...
 case INTSXP:
 for (i = 0; i   ns; i++)
 INTEGER(s)[i] = INTEGER(t)[i % nt];
 break;
 ...
 
 could that be replaced with :
 
 case INTSXP:
 for (i=0; ins/nt; i++)
 memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char *)DATAPTR(t),
 nt*sizeof(int));
 break;
 
 or at least with something like this:
 
 int* p_s = INTEGER(s) ;
 int* p_t = INTEGER(t) ;
 for( i=0 ; i  ns ; i++){
 p_s[i] = p_t[i % nt];
 }
 
 since expanding the INTEGER macro over and over has a price.
 
 
 ... in packages, yes, but not in core R.
 
 Hmm. I was not talking about the overhead of the INTEGER function:
 
 int *(INTEGER)(SEXP x) {
if(TYPEOF(x) != INTSXP  TYPEOF(x) != LGLSXP)
   error(%s() can only be applied to a '%s', not a '%s',
 INTEGER, integer, type2char(TYPEOF(x)));
return INTEGER(x);
 }
 
 
 
 but the one related to the macro.
 
 #define INTEGER(x)((int *) DATAPTR(x))
 #define DATAPTR(x)(((SEXPREC_ALIGN *) (x)) + 1)
 
 so the loop expands to :
 
 for (i = 0; i   ns; i++)
  ((int *) (((SEXPREC_ALIGN *) (s)) + 1))[i] = ((int *) 
 (((SEXPREC_ALIGN *) (t)) + 1))[i % nt];
 
 I still believe grabbing the pointer just once for s and once for t is more 
 efficient ...
 

Nope, since everything involved is loop invariant so the pointer values don't 
change (you'd have to declare s or t volatile to prevent that).

Try using gcc -s and you'll see that the code is the same (depending on the 
version of gcc the order of the first comparison can change so technically the 
INTEGER(x) version can save one add instruction in the degenerate case and be 
faster(!) in old gcc).

Cheers,
Simon



 
 and similar for the other types in copyVector.  This won't help regular
 vector copies, since those seem to be done by the DUPLICATE_ATOMIC_VECTOR
 macro, see next suggestion below, but it should help copyMatrix which calls
 copyVector, scan.c which calls copyVector on three lines, dcf.c (once) and
 dounzip.c (once).
 
 For the DUPLICATE_ATOMIC_VECTOR macro there is already a comment next to it
 :
 
 FIXME: surely memcpy would be faster here?
 
 which seems to refer to the for loop  :
 
 else { \
 int __i__; \
 type *__fp__ = fun(from), *__tp__ = fun(to); \
 for (__i__ = 0; __i__   __n__; __i__++) \
   __tp__[__i__] = __fp__[__i__]; \
   } \
 
 Could that loop be replaced by the following ?
 
else { \
memcpy((char *)DATAPTR(to), (char *)DATAPTR(from), __n__*sizeof(type)); 
 \
}\
 
 In the data.table package, dogroups.c uses this technique, so the principle
 is tested and works well so far.
 
 Are there any road blocks preventing this change, or is anyone already
 working on it ?  If not then I'll try and test it (on Ubuntu 32bit) and
 submit patch with timings, as before.  Comments/pointers much appreciated.
 
 Matthew
 
 
 -- 
 Romain Francois
 Professional R Enthusiast
 +33(0) 6 28 91 30 30
 http://romainfrancois.blog.free.fr
 |- http://bit.ly/9aKDM9 : embed images in Rd documents
 |- http://tr.im/OIXN : raster images and RImageJ
 |- http://tr.im/OcQe : Rcpp 0.7.7
 
 
 

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread Simon Urbanek

On Apr 21, 2010, at 4:39 PM, Simon Urbanek wrote:

 
 On Apr 21, 2010, at 4:13 PM, Romain Francois wrote:
 
 Le 21/04/10 21:39, Simon Urbanek a écrit :
 
 
 On Apr 21, 2010, at 3:32 PM, Romain Francois wrote:
 
 Le 21/04/10 17:54, Matthew Dowle a écrit :
 
 From copyVector in duplicate.c :
 
 void copyVector(SEXP s, SEXP t)
 {
int i, ns, nt;
nt = LENGTH(t);
ns = LENGTH(s);
switch (TYPEOF(s)) {
 ...
case INTSXP:
for (i = 0; i   ns; i++)
INTEGER(s)[i] = INTEGER(t)[i % nt];
break;
 ...
 
 could that be replaced with :
 
case INTSXP:
for (i=0; ins/nt; i++)
memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char *)DATAPTR(t),
 nt*sizeof(int));
break;
 
 or at least with something like this:
 
 int* p_s = INTEGER(s) ;
 int* p_t = INTEGER(t) ;
 for( i=0 ; i  ns ; i++){
p_s[i] = p_t[i % nt];
 }
 
 since expanding the INTEGER macro over and over has a price.
 
 
 ... in packages, yes, but not in core R.
 
 Hmm. I was not talking about the overhead of the INTEGER function:
 
 int *(INTEGER)(SEXP x) {
   if(TYPEOF(x) != INTSXP  TYPEOF(x) != LGLSXP)
  error(%s() can only be applied to a '%s', not a '%s',
INTEGER, integer, type2char(TYPEOF(x)));
   return INTEGER(x);
 }
 
 
 
 but the one related to the macro.
 
 #define INTEGER(x)   ((int *) DATAPTR(x))
 #define DATAPTR(x)   (((SEXPREC_ALIGN *) (x)) + 1)
 
 so the loop expands to :
 
 for (i = 0; i   ns; i++)
 ((int *) (((SEXPREC_ALIGN *) (s)) + 1))[i] = ((int *) 
 (((SEXPREC_ALIGN *) (t)) + 1))[i % nt];
 
 I still believe grabbing the pointer just once for s and once for t is more 
 efficient ...
 
 
 Nope, since everything involved is loop invariant so the pointer values don't 
 change (you'd have to declare s or t volatile to prevent that).
 
 Try using gcc -s

Sorry, I meant gcc -S of course.


 and you'll see that the code is the same (depending on the version of gcc the 
 order of the first comparison can change so technically the INTEGER(x) 
 version can save one add instruction in the degenerate case and be faster(!) 
 in old gcc

[the reason being that INTEGER(x) does not need to be evaluated if the loop is 
not entered whereas p_s and p_t are populated unconditionally - smarter 
compilers will notice that p_s/p_t are not used in that case and thus generate 
identical code in both cases]

Cheers,
Simon


 
 
 
 and similar for the other types in copyVector.  This won't help regular
 vector copies, since those seem to be done by the DUPLICATE_ATOMIC_VECTOR
 macro, see next suggestion below, but it should help copyMatrix which 
 calls
 copyVector, scan.c which calls copyVector on three lines, dcf.c (once) and
 dounzip.c (once).
 
 For the DUPLICATE_ATOMIC_VECTOR macro there is already a comment next to 
 it
 :
 
FIXME: surely memcpy would be faster here?
 
 which seems to refer to the for loop  :
 
else { \
int __i__; \
type *__fp__ = fun(from), *__tp__ = fun(to); \
for (__i__ = 0; __i__   __n__; __i__++) \
  __tp__[__i__] = __fp__[__i__]; \
  } \
 
 Could that loop be replaced by the following ?
 
   else { \
   memcpy((char *)DATAPTR(to), (char *)DATAPTR(from), __n__*sizeof(type)); 
 \
   }\
 
 In the data.table package, dogroups.c uses this technique, so the 
 principle
 is tested and works well so far.
 
 Are there any road blocks preventing this change, or is anyone already
 working on it ?  If not then I'll try and test it (on Ubuntu 32bit) and
 submit patch with timings, as before.  Comments/pointers much appreciated.
 
 Matthew
 
 
 -- 
 Romain Francois
 Professional R Enthusiast
 +33(0) 6 28 91 30 30
 http://romainfrancois.blog.free.fr
 |- http://bit.ly/9aKDM9 : embed images in Rd documents
 |- http://tr.im/OIXN : raster images and RImageJ
 |- http://tr.im/OcQe : Rcpp 0.7.7
 
 
 
 
 __
 R-devel@r-project.org mailing list
 https://stat.ethz.ch/mailman/listinfo/r-devel
 
 

__
R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel


Re: [Rd] suggestion how to use memcpy in duplicate.c

2010-04-21 Thread William Dunlap
If I were worried about the time this loop takes,
I would avoid using i%nt.  For the attached C code
compile with gcc 4.3.3 with -O2 I get 
   # INTEGER() in loop
   system.time( r1 - .Call(my_rep1, 1:3, 1e7) )
 user  system elapsed
0.060   0.012   0.071

   # INTEGER() before loop
   system.time( r2 - .Call(my_rep2, 1:3, 1e7) )
 user  system elapsed
0.076   0.008   0.086

   # replace i%src_length in loop with j=0 before loop and
   #if(++j==src_length) j=0 ;
   # in the loop.
   system.time( r3 - .Call(my_rep3, 1:3, 1e7) )
 user  system elapsed
0.024   0.028   0.050
   identical(r1,r2)  identical(r2,r3)
  [1] TRUE

The C code is:
#define USE_RINTERNALS /* pretend we are in the R kernel */
#include R.h
#include Rinternals.h


SEXP my_rep1(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
for(i=0;idest_length;i++) {
INTEGER(s_dest)[i] = INTEGER(s_src)[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
}
SEXP my_rep2(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(i=0;idest_length;i++) {
pdest[i] = psrc[i % src_length] ;
}
UNPROTECT(1) ;
return s_dest ;
}
SEXP my_rep3(SEXP s_src, SEXP s_dest_length)
{
int src_length = length(s_src) ;
int dest_length = asInteger(s_dest_length) ;
int *psrc = INTEGER(s_src) ;
int *pdest ;
int i,j ;
SEXP s_dest ;
PROTECT(s_dest = allocVector(INTSXP, dest_length)) ;
pdest = INTEGER(s_dest) ;
if(TYPEOF(s_src) != INTSXP) error(src must be integer data) ;
/* end of boilerplate */
for(j=0,i=0;idest_length;i++) {
*pdest++ = psrc[j++] ;
if (j==src_length) {
j = 0 ;
}
}
UNPROTECT(1) ;
return s_dest ;
}

Bill Dunlap
Spotfire, TIBCO Software
wdunlap tibco.com  

 -Original Message-
 From: r-devel-boun...@r-project.org 
 [mailto:r-devel-boun...@r-project.org] On Behalf Of Romain Francois
 Sent: Wednesday, April 21, 2010 12:32 PM
 To: Matthew Dowle
 Cc: r-de...@stat.math.ethz.ch
 Subject: Re: [Rd] suggestion how to use memcpy in duplicate.c
 
 Le 21/04/10 17:54, Matthew Dowle a écrit :
 
  From copyVector in duplicate.c :
 
  void copyVector(SEXP s, SEXP t)
  {
   int i, ns, nt;
   nt = LENGTH(t);
   ns = LENGTH(s);
   switch (TYPEOF(s)) {
  ...
   case INTSXP:
   for (i = 0; i  ns; i++)
   INTEGER(s)[i] = INTEGER(t)[i % nt];
   break;
  ...
 
  could that be replaced with :
 
   case INTSXP:
   for (i=0; ins/nt; i++)
   memcpy((char *)DATAPTR(s)+i*nt*sizeof(int), (char 
 *)DATAPTR(t),
  nt*sizeof(int));
   break;
 
 or at least with something like this:
 
 int* p_s = INTEGER(s) ;
 int* p_t = INTEGER(t) ;
 for( i=0 ; i  ns ; i++){
   p_s[i] = p_t[i % nt];
 }
 
 since expanding the INTEGER macro over and over has a price.
 
  and similar for the other types in copyVector.  This won't 
 help regular
  vector copies, since those seem to be done by the 
 DUPLICATE_ATOMIC_VECTOR
  macro, see next suggestion below, but it should help 
 copyMatrix which calls
  copyVector, scan.c which calls copyVector on three lines, 
 dcf.c (once) and
  dounzip.c (once).
 
  For the DUPLICATE_ATOMIC_VECTOR macro there is already a 
 comment next to it
  :
 
   FIXME: surely memcpy would be faster here?
 
  which seems to refer to the for loop  :
 
   else { \
   int __i__; \
   type *__fp__ = fun(from), *__tp__ = fun(to); \
   for (__i__ = 0; __i__  __n__; __i__++) \
 __tp__[__i__] = __fp__[__i__]; \
 } \
 
  Could that loop be replaced by the following ?
 
  else { \
  memcpy((char *)DATAPTR(to), (char *)DATAPTR(from), 
 __n__*sizeof(type)); \
  }\
 
  In the data.table package, dogroups.c uses this technique, 
 so the principle
  is tested and works well so far.
 
  Are there any road blocks preventing this change, or is 
 anyone already
  working on it ?  If not then I'll try and test it (on 
 Ubuntu 32bit) and
  submit patch with timings, as before.  Comments/pointers 
 much appreciated.
 
  Matthew
 
  __
  R-devel@r-project.org mailing list
  https://stat.ethz.ch/mailman/listinfo/r-devel
 
 
 
 
 -- 
 Romain Francois
 Professional R Enthusiast
 +33(0) 6 28 91 30 30
 http://romainfrancois.blog.free.fr
 |- http://bit.ly/9aKDM9 : embed images in Rd documents
 |- http://tr.im/OIXN : raster images and RImageJ
 |- http://tr.im/OcQe : Rcpp