# Say I have a list of tasks, eg tasks i=1:n
# For each task I want to call a function foo
# that depends on that task and some fixed data
# I have many types of fixed data: eg, arrays, dictionaries, integers, etc

# Imagine the data comes from eg loading a file based on user input,
# so we can't hard code the data into the function foo 
# although it's constant during program execution

# If I were doing this in serial, I'd do the following

# Container for the fixed per-run data. `mutable struct` replaces the
# pre-0.6 `type` keyword; parametric fields keep every field concretely
# typed (the original Any-typed fields defeat specialization) while the
# constructor call sites stay unchanged.
#   myint   - scalar offset added to every result
#   mydict  - maps a task id to an index into `myarray`
#   myarray - the (potentially huge) lookup array
mutable struct MyData{I,D,A}
    myint::I
    mydict::D
    myarray::A
end

# Result for `task`: the fixed scalar offset plus the array element chosen
# by sending the task id through the dictionary.
function foo(task, data::MyData)
    idx = data.mydict[task]
    return data.myint + data.myarray[idx]
end

n = 10
# Dict(zip(keys, vals)) replaces the two-argument Dict(keys, vals)
# constructor (removed in later Julia versions); it builds the
# task-id -> array-index mapping.
const data = MyData(rand(), Dict(zip(1:n, randperm(n))), randperm(n))

# Serial baseline: evaluate every task against the same fixed data.
results = zeros(n)
for i in 1:n
    results[i] = foo(i, data)
end

# What's the right way to do this in parallel? Here are a number of ideas
# To use @parallel or pmap, we have to first copy all the code and data everywhere
# I'd like to avoid that, since the data is huge (10 - 100 GB)

# Replicate code AND data on every worker. This is the straightforward
# approach the post wants to avoid for huge data, shown here modernized:
# `struct` replaces the pre-0.6 `type`, and Dict(zip(...)) replaces the
# removed two-argument Dict(keys, vals) constructor.
# NOTE(review): on Julia >= 1.0 this needs `using Distributed` (for
# @everywhere) and `using Random` (for randperm) loaded first.
@everywhere begin
    mutable struct MyData{I,D,A}
        myint::I
        mydict::D
        myarray::A
    end

    # Result for `task`: fixed offset plus the array entry selected by
    # mapping the task id through the dictionary.
    function foo(task, data::MyData)
        return data.myint + data.myarray[data.mydict[task]]
    end

    n = 10
    const data = MyData(rand(), Dict(zip(1:n, randperm(n))), randperm(n))
end

## @distributed (formerly @parallel, removed in Julia 1.0)
using Distributed, SharedArrays

# NOTE: the original wrote into a plain Array from the parallel loop; each
# worker mutates its own local copy, so the master's `results` stays all
# zeros. A SharedArray gives every process a view of the same memory, and
# @sync blocks until all workers have finished their iterations.
results = SharedArray{Float64}(n)
@sync @distributed for i in 1:n
    results[i] = foo(i, data)
end

## pmap
# Define a one-argument method on every worker so pmap can ship just the
# task id; `data` is resolved as a global on each worker (it was copied
# there by the @everywhere block above), so only `task` travels per call.
@everywhere foo(task) = foo(task,data)
results = pmap(foo,1:n)

# To avoid copying data, I can make myarray a shared array
# In that case, I don't want to use @everywhere to put data on each processor
# since that would reinstantiate the shared array.
# My current solution is to rewrite my data structure to *not* include myarray,
# and pass the array to the function foo separately.
# But the code gets much less pretty as I tear apart my data structure,
# especially if I have a large number of shared arrays. 
# Is there a way for me to avoid this while using shared memory?
# really, I'd like to be able to define my own shared memory data types...

# Replicate only the cheap fields; the big array stays out of the struct
# and is passed separately as a SharedArray. Modernized: `struct` replaces
# the pre-0.6 `type`, Dict(zip(...)) replaces the removed two-argument
# Dict(keys, vals) constructor.
# NOTE(review): on Julia >= 1.0 this needs `using Distributed`, `Random`,
# and `SharedArrays` loaded first.
@everywhere begin
    mutable struct MySmallerData{I,D}
        myint::I
        mydict::D
    end

    # Same computation as before, but the array now arrives as an explicit
    # SharedArray argument instead of living inside the struct.
    function foo(task, data::MySmallerData, myarray::SharedArray)
        return data.myint + myarray[data.mydict[task]]
    end

    n = 10
    const data = MySmallerData(rand(), Dict(zip(1:n, randperm(n))))
end

myarray = SharedArray(randperm(n))

## @distributed (formerly @parallel, removed in Julia 1.0)
using Distributed, SharedArrays

# NOTE: writing into a plain Array from the distributed loop only mutates
# worker-local copies, so the master never sees the results. Collect into
# a SharedArray instead, and use @sync to wait for all workers.
results = SharedArray{Float64}(n)
@sync @distributed for i in 1:n
    results[i] = foo(i, data, myarray)
end

## pmap
# One-argument wrapper defined on all workers; it resolves `data` and
# `myarray` (the SharedArray) as worker-local globals, so pmap only ships
# the task id per call.
@everywhere foo(task) = foo(task,data,myarray)
results = pmap(foo,1:n)

# Finally, what can I do to avoid copying mydict to each processor?
# Is there a way to use shared memory for it?
# Once again, I'd really like to be able to define my own shared memory data types...

# (mailing-list footer) Reply via email to