Re: [Jchat] quickest looping construct

Joe Bogner Thu, 05 Feb 2015 20:09:03 -0800

I decided to take the tcc experiment a few steps further and
implemented a multi-threaded version of infix which is JIT compiled
into the J session


The C code is far from perfect. Single threaded it takes 3.716 seconds
to process a 9 character overlapping infix on a 242 million character
array.

J can still natively complete it in in 2 seconds with 9[\ txt

Using 4 threads, it completes in 2.87 seconds (22% improvement over
single threaded, 43% slower than J's single threaded with MS optimized
memcpy) and uses 100% cpu (good)

I learned along the way that malloc will fail with allocations greater
than 2gb in the dll loaded into J. Not sure why, but I was able to
work around it by using VirtualAlloc

I realize this is a deep dive, but figured others may be interested
and it seems chat worthy. It reinforces that multiple threads aren't
always needed, however it proves that we can do multi-threaded code
from within J which may be useful in other applications.

The next task is to do a multi-threaded overlapped nub count.

Even though J can do a 9 character overlapping prefix in roughly 2
seconds, it takes 49 seconds to do a nub of that.

$ 9[\txt
242193526 9

6!:2 '# ~. 9[\txt'
49.4751

Since the multi-threaded infix runs similarly to J's native, it seems
I could write 'special code' to do a nub count over the infixes
without saving each one.

Raul's block approach takes about the same time ~ 50 seconds to complete.

The nub count infix won't be actually used for anything useful other
than for learning.

Here is the multi-threaded infix in J:

testInfixThreaded_code=: 0 : 0
#include <stdio.h>
#include <windows.h>
#include <process.h>

#define LONG long long

const int THREADS = 4;
void free(void* ptr) {
 //free wont work since we use VirtualAlloc
 //free(ptr);
 VirtualFree(ptr, 0, MEM_RELEASE);
}


typedef struct ThreadData ThreadData;
struct ThreadData {
       void *mem;
       int threadId;
       LONG d;
       int infixLen;
       void *newMem;
       LONG len;
};

int  infixThreaded(void *threadData) {
    ThreadData *args = (ThreadData*)threadData;
    LONG infixLen = args->infixLen;
    LONG blockSize = (args->d+(THREADS-1))/THREADS;
    LONG d = args->d;

    char *x=(char*)args->newMem;
    char *y=args->mem;
    LONG ctr = 0;

    for(LONG i=0;i<args->threadId*blockSize;i++) {
        x+=infixLen;
        y+=1;
        ctr+=1;
    }


    for(LONG i=0;i<=blockSize && ctr<=d;i++) {
        memcpy(x,y,infixLen);
        //printf("start: %d on thread %d, blockSize : %d\n", ctr,
args->threadId, blockSize);
        x+=infixLen;
        y+=1;
        ctr++;
    }

    return 0;
}


int infix(void* pmem, LONG len, LONG infixLen, LONG *out) {
    LONG allocLen = len*sizeof(char)*infixLen;

    //malloc fails on larger than 2gb allocations
    //char *newMem = (char*)malloc(allocLen);
    char *newMem = VirtualAlloc(0, allocLen, MEM_COMMIT | MEM_RESERVE,
PAGE_READWRITE);
    if (NULL == newMem) {
       printf("could not allocate\n");
       fflush(stdout);
       return -1;
    }

    char *mem = (char*)pmem;
    LONG d = 1+len-infixLen;


    //skip threads for small args
    if (d < 1000000) {
        char *x=newMem;
        char *y=mem;
        for(int i=0;i<d;i++) {
            memcpy(x,y,infixLen);
            x+=infixLen;
            y+=1;
        }
    } else {

        HANDLE* threads = (HANDLE*)malloc(sizeof(HANDLE)*THREADS);
        ThreadData **args = (ThreadData**)malloc(sizeof(ThreadData)*THREADS);
        for(int i=0;i<THREADS;i++) {
            ThreadData *arg = (ThreadData*)malloc(sizeof(ThreadData));
            arg->mem = pmem;
            arg->threadId = i;
            arg->d = d;
            arg->infixLen = infixLen;
            arg->newMem = newMem;
            arg->len = len;
            args[i] = arg;
            threads[i] = _beginthreadex(NULL, 0, &infixThreaded, arg, 0, NULL);
        }
        WaitForMultipleObjects(THREADS, threads, 1, INFINITE);
        for(int i = 0; i<THREADS; i++) { CloseHandle(threads[i]);
free(args[i]); }
        free(threads);
    }

    printf("done\n");

    fflush(stdout);
    *out = (d*infixLen);
    return newMem;
}

)

testinfixThreaded=: 3 : 0
txt=.y
addr=. mema 4*(#txt)
txt memw addr,0,(#txt)
infixSize=.9
tcc=:init_tcc_''
compile_tcc_ < testInfixThreaded_code
ret=:execIntOutInt_tcc_ 'infix';addr;(#txt);infixSize
NB. smoutput ret
'memPtr size'=: ret
output =: memr memPtr,0,size
execInt_tcc_ 'free';memPtr;0
smoutput 'done'
free_tcc_''
ret
NB. output
)

On Wed, Feb 4, 2015 at 1:43 PM, Joe Bogner <[email protected]> wrote:
> Raul,
>
> That is an elegant answer and one I'll add to my notes to reference in
> the future.
>
> It takes approximately 50 seconds to process 240 million 9 character
> infixes using 50 million character blocks.
>
> As a comparison, I wrote an integration to TCC which enables
> generating and calling C code from J
>
> https://github.com/joebo/lang-lab/blob/master/j/tcc.ijs
>
> It could use some cleanup and it's not tested on linux or win32
>
> It includes everything to build the stub dll and then some examples.
>
> Here's an example of creating infixes on a character array:
>
> testInfix_code=: 0 : 0
> #include <stdio.h>
>
> void free2(long ptr) {
>  free(ptr);
> }
>
> long infix(long pmem, long len, int infixLen, long long *out) {
>     long long allocLen = len*sizeof(char)*infixLen;
>     char *newMem = (char*)malloc(allocLen);
>     char *mem = (char*)pmem;
>
>     printf("len: %d\n", len);
>     fflush(stdout);
>     long long offset = 0;
>     long long idx = 0;
>
>     for(long long i=0;i<len && ((len-offset) >= infixLen);i++) {
>         for(long q=0;q<infixLen;q++) {
>             newMem[idx++] = mem[q+offset];
>         }
>         offset++;
>     }
>
>     *out = idx;
>     return newMem;
> }
> )
>
> To run this:
>
> infix=: 3 : 0
> txt=.y
> addr=. mema 4*(#txt)
> txt memw addr,0,(#txt)
> infixSize=.9
> tcc=:init_tcc_''
> compile_tcc_ < testInfix_code
> ret=.execIntOutInt_tcc_ 'infix';addr;(#txt);infixSize
> smoutput ret
> 'memPtr size'=: ret
> output =: memr memPtr,0,size
> execInt_tcc_ 'free';memPtr;0
> free_tcc_''
> output
> )
>
> testInfix =: 3 : 0
> txt=. 'abcdefghijklmnopqrstuvwxyz'
> infix txt
> )
>
>   _9[\ testInfix''
> len: 26
> +--------+---+
> |15900816|162|
> +--------+---+
> abcdefghi
> bcdefghij
> cdefghijk
> defghijkl
> efghijklm
> fghijklmn
> ghijklmno
> hijklmnop
> ijklmnopq
> jklmnopqr
> klmnopqrs
> lmnopqrst
> mnopqrstu
> nopqrstuv
> opqrstuvw
> pqrstuvwx
> qrstuvwxy
> rstuvwxyz
>
>
>
> I ended up running into problems with sizes greater than 75 million,
> probably due to a long size issue.
>
> Funny enough, it's actually slower than J's version by a factor of 4.
>
>    6!:2 'b1=:_9[\(infix 50e6{. txt)'
> 1.82504
>    6!:2 'b2=:9[\ 50e6{. txt'
> 0.431845
>    b1-:b2
> 1
>
>
>
> This could be a combination of TCC's ability to optimize code or it
> could be J's algorithm is implemented more efficiently than mine
> (probably both)
>
> The actual JIT operation only takes about 5/100 of a second
>
> I'm sure I could profile it and speed it up, but I've already spent
> considerably more time than desired on this "challenge".
>
> On the upside, we now have a prototype for generating C code from J
>
>
>
> On Tue, Feb 3, 2015 at 8:05 PM, Raul Miller <[email protected]> wrote:
>> Here's how I'd implement the overlapped block thing.
>>
>> overlapblock=:2 :0
>> :
>>   'size overlap init'=. x
>>   bs=. size*i.>.(#y)%{.size
>>   bl=. bs-~(#y)<.bs+size+overlap
>>   accum=. init
>>   for_sl. bs,.bl do.
>>     accum=. accum u v ((+i.)/sl){y
>>   end.
>> )
>>
>>    #(10;9;i.0 0) ~.@, overlapblock (~.@(9&(]\))) 30$'abcdefghij'
>> 10
>>
>> Of course change the value for x and y to match what you want to do...
>> And maybe recode to swap u and v, if that looks better.
>>
>> If that's too slow, it might be better to gather intermediate results
>> from each block and then post-process them in a single pass?
>>
>> Thanks,
>>
>> --
>> Raul
>>
>> On Tue, Feb 3, 2015 at 6:24 PM, Joe Bogner <[email protected]> wrote:
>>> On Tue, Feb 3, 2015 at 6:17 PM, Raul Miller <[email protected]> wrote:
>>>> So you are working with non-overlapping infixes?
>>>>
>>>>    _9[\'abcdefghijklmnopqrstuvwxyz'
>>>> abcdefghi
>>>> jklmnopqr
>>>> stuvwxyz
>>>>
>>>
>>>
>>> Sorry, no that was a typo. I can use non-overlapping to split up into
>>> blocks, but I need overlapping for the string.
>>>
>>> It is overlapping infixes as I am calculating the unique 9 character 
>>> substrings
>>>
>>> 9[\abc
>>> abcdefghi
>>> bcdefghij
>>> cdefghijk
>>> defghijkl
>>> efghijklm
>>> fghijklmn
>>> ghijklmno
>>> hijklmnop
>>> ijklmnopq
>>> jklmnopqr
>>> klmnopqrs
>>> lmnopqrst
>>> mnopqrstu
>>> nopqrstuv
>>> opqrstuvw
>>> pqrstuvwx
>>> qrstuvwxy
>>> rstuvwxyz
>>>
>>> # ~. 9[\abc
>>> 18
>>> ----------------------------------------------------------------------
>>> For information about J forums see http://www.jsoftware.com/forums.htm
>> ----------------------------------------------------------------------
>> For information about J forums see http://www.jsoftware.com/forums.htm
----------------------------------------------------------------------
For information about J forums see http://www.jsoftware.com/forums.htm

Re: [Jchat] quickest looping construct

Reply via email to