Addition and mul can be counted as 1 operation, but not trigonometric
fct. A roungh estimate would be they are worth 50 operation(order of
magnitude right on CPU) So this give 17k. In the same ball park for
this estimation:)

Can you just run this function on many inputs in parallel? It would be
easier to parallelize an outer loop then this.

Fred

On Mon, Mar 17, 2014 at 11:21 AM, Jason Moore <[email protected]> wrote:
> I'm still digesting what Matthew and Max wrote. Lots of new words for me :)
> But here is a simple example taken from C code we generate for a simple 2
> link pendulum.
>
> First the C code with SymPy's CSE expressions automatically generated:
>
> #include <math.h>
> #include "multibody_system_c.h"
>
> void mass_forcing(double constants[6], // constants = [g, m0, l0, m1, l1,
> m2]
>                   double coordinates[3], // coordinates = [q0, q1, q2]
>                   double speeds[3], // speeds = [u0, u1, u2]
>                   double mass_matrix[36], // computed
>                   double forcing_vector[6]) // computed
> {
>     // common subexpressions
>     double z_0 = coordinates[1];
>     double z_1 = sin(z_0);
>     double z_2 = constants[2]*z_1;
>     double z_3 = -constants[3]*z_2 - constants[5]*z_2;
>     double z_4 = coordinates[2];
>     double z_5 = sin(z_4);
>     double z_6 = -constants[4]*constants[5]*z_5;
>     double z_7 = pow(constants[2], 2);
>     double z_8 = constants[2]*constants[4]*constants[5];
>     double z_9 = cos(z_0);
>     double z_10 = cos(z_4);
>     double z_11 = z_8*(z_1*z_5 + z_10*z_9);
>     double z_12 = speeds[1];
>     double z_13 = speeds[2];
>     double z_14 = pow(z_12, 2);
>     double z_15 = constants[2]*z_14*z_9;
>     double z_16 = pow(z_13, 2);
>     double z_17 = constants[4]*constants[5]*z_10;
>     double z_18 = constants[0]*constants[2]*z_9;
>     double z_19 = z_5*z_9;
>     double z_20 = z_1*z_10;
>
>     // mass matrix
>     mass_matrix[0] = 1;
>     mass_matrix[1] = 0;
>     mass_matrix[2] = 0;
>     mass_matrix[3] = 0;
>     mass_matrix[4] = 0;
>     mass_matrix[5] = 0;
>     mass_matrix[6] = 0;
>     mass_matrix[7] = 1;
>     mass_matrix[8] = 0;
>     mass_matrix[9] = 0;
>     mass_matrix[10] = 0;
>     mass_matrix[11] = 0;
>     mass_matrix[12] = 0;
>     mass_matrix[13] = 0;
>     mass_matrix[14] = 1;
>     mass_matrix[15] = 0;
>     mass_matrix[16] = 0;
>     mass_matrix[17] = 0;
>     mass_matrix[18] = 0;
>     mass_matrix[19] = 0;
>     mass_matrix[20] = 0;
>     mass_matrix[21] = constants[1] + constants[3] + constants[5];
>     mass_matrix[22] = z_3;
>     mass_matrix[23] = z_6;
>     mass_matrix[24] = 0;
>     mass_matrix[25] = 0;
>     mass_matrix[26] = 0;
>     mass_matrix[27] = z_3;
>     mass_matrix[28] = constants[3]*z_7 + constants[5]*z_7;
>     mass_matrix[29] = z_11;
>     mass_matrix[30] = 0;
>     mass_matrix[31] = 0;
>     mass_matrix[32] = 0;
>     mass_matrix[33] = z_6;
>     mass_matrix[34] = z_11;
>     mass_matrix[35] = pow(constants[4], 2)*constants[5];
>
>     // forcing vector
>     forcing_vector[0] = speeds[0];
>     forcing_vector[1] = z_12;
>     forcing_vector[2] = z_13;
>     forcing_vector[3] = constants[3]*z_15 + constants[5]*z_15 + z_16*z_17;
>     forcing_vector[4] = -constants[3]*z_18 - constants[5]*z_18 +
> z_16*z_8*(z_19 - z_20);
>     forcing_vector[5] = -constants[0]*z_17 + z_14*z_8*(-z_19 + z_20);
> }
>
>
> Now I manually group these expression evaluations into "stacks", i.e. those
> calls which could happen in parallel (there is of course a bit more
> complicated dependency graph you can draw so that you maximize the time that
> your cores have a task).
>
> // These are not computations but just value assignments.
> z_0 = coordinates[1];
> z_4 = coordinates[2];
> z_12 = speeds[1];
> z_13 = speeds[2];
> mass_matrix[0] = 1;
> mass_matrix[1] = 0;
> mass_matrix[2] = 0;
> mass_matrix[3] = 0;
> mass_matrix[4] = 0;
> mass_matrix[5] = 0;
> mass_matrix[6] = 0;
> mass_matrix[7] = 1;
> mass_matrix[8] = 0;
> mass_matrix[9] = 0;
> mass_matrix[10] = 0;
> mass_matrix[11] = 0;
> mass_matrix[12] = 0;
> mass_matrix[13] = 0;
> mass_matrix[14] = 1;
> mass_matrix[15] = 0;
> mass_matrix[16] = 0;
> mass_matrix[17] = 0;
> mass_matrix[18] = 0;
> mass_matrix[19] = 0;
> mass_matrix[20] = 0;
> mass_matrix[24] = 0;
> mass_matrix[25] = 0;
> mass_matrix[26] = 0;
> mass_matrix[30] = 0;
> mass_matrix[31] = 0;
> mass_matrix[32] = 0;
> forcing_vector[0] = speeds[0];
> forcing_vector[1] = z_12;
> forcing_vector[2] = z_13;
>
> // These are computations that involve the initial values passed into the
> // function, i.e. stack #1.
> z_7 = pow(constants[2], 2);
> z_8 = constants[2]*constants[4]*constants[5];
> z_14 = pow(z_12, 2);
> z_16 = pow(z_13, 2);
> mass_matrix[21] = constants[1] + constants[3] + constants[5];
> mass_matrix[35] = pow(constants[4], 2)*constants[5];
>
> // Stack #2
> z_1 = sin(z_0);
> z_5 = sin(z_4);
> z_9 = cos(z_0);
> z_10 = cos(z_4);
> z_2 = constants[2]*z_1;
> mass_matrix[28] = constants[3]*z_7 + constants[5]*z_7;
>
> // Stack #3
> z_3 = -constants[3]*z_2 - constants[5]*z_2;
> z_6 = -constants[4]*constants[5]*z_5;
> z_11 = z_8*(z_1*z_5 + z_10*z_9);
> z_15 = constants[2]*z_14*z_9;
> z_17 = constants[4]*constants[5]*z_10;
> z_18 = constants[0]*constants[2]*z_9;
> z_19 = z_5*z_9;
> z_20 = z_1*z_10;
>
> // Stack #4
> mass_matrix[22] = z_3;
> mass_matrix[23] = z_6;
> mass_matrix[27] = z_3;
> mass_matrix[29] = z_11;
> mass_matrix[33] = z_6;
> mass_matrix[34] = z_11;
> forcing_vector[3] = constants[3]*z_15 + constants[5]*z_15 + z_16*z_17;
> forcing_vector[4] = -constants[3]*z_18 - constants[5]*z_18 + z_16*z_8*(z_19
> - z_20);
> forcing_vector[5] = -constants[0]*z_17 + z_14*z_8*(-z_19 + z_20);
>
>
> So this simplified example of the dependencies in the CSE's shows that if I
> had enough cores available I could parallelize each stack, potentially
> increasing the execution speed. So instead of 31 evaluations, you could have
> 4 evaluations in parallel, ideally a 7.75x speedup. For more complicated
> problems, there could be thousands and thousands of these CSEs, but I'll
> need to generate their dependencies with code to see if things stack this
> nicely for the big problems. I suspect the dependency chain could be such
> that the higher number stacks could have hundreds of expressions whereas the
> lower stacks have fewer, or vice versa.
>
> How do I generate a DAG for long expressions in SymPy? Is this part of the
> internal architecture of SymPy expressions? I don't understand how the cse()
> code works yet either, but it seems like this information should be computed
> already. I just need to visualize the graph for some of our bigger problems.
>
> Also, the for the number of scalars and number of operations in each. Here
> is an bigger problem with 2000 or so CSE's:
>
> https://github.com/moorepants/dissertation/blob/master/src/extensions/arms/ArmsDynamics.c
>
> This problem has 12 scalars that have 2000+ CSE's and there are 5840
> additions and subtractions, 9847 multiplications and divisions, 14 cosines,
> and 14 sines. So roughly 1300 operations per scalar.
>
>
> Jason
> moorepants.info
> +01 530-601-9791
>
>
> On Mon, Mar 17, 2014 at 12:06 AM, Matthew Rocklin <[email protected]>
> wrote:
>>
>> Response from Max follows (for some reason he was getting bounced by the
>> mailing list).
>>
>>
>> On Sun, Mar 16, 2014 at 8:55 PM, Max Hutchinson <[email protected]>
>> wrote:
>>>
>>> tl;dr it depends on the DAG, but improved ILP is is likely possible (if
>>> difficult) and there could be room for multi-core parallelism as well.
>>>
>>> As I understand it, we're talking about a long computation applied to
>>> short input vectors.  If the computation can be applied to many input
>>> vectors at once, independent of each other, then all levels of parallelism
>>> (multiple instructions, multiple cores, multiple sockets, multiple nodes)
>>> can be used.  This is data-parallelism, which is great! However, it doesn't
>>> sound like this is the case.
>>>
>>> It sounds like you're thinking of building a DAG of these CSEs and trying
>>> to use task-parallelism over independent parts of it (automatically using
>>> sympy or theano or what have you).  The tension here is going to be between
>>> locality and parallelism: how much compute hardware can you spread your data
>>> across without losing the nice cache performance that your small input
>>> vectors gain you.  I'd bet that going off-socket is way too wide.  Modern
>>> multi-core architectures have core-local L2 and L1 caches, so if your input
>>> data fits nicely into L2 and your DAG isn't really local, you probably won't
>>> get anything out of multiple-cores.  Your last stand is single-core
>>> parallelism (instruction-level parallelism), which sympy et al may or may
>>> not be well equipped to influence.
>>>
>>> To start, I'd recommend that you take a look at your DAGs and try to
>>> figure out how large the independent chunks are.  Then, estimate the amount
>>> of instruction level parallelism when you run in 'serial' (which you can do
>>> with flop-counting).  If your demonstrated ILP is less than your independent
>>> chunk size, then at least improved ILP should be possible.  Automatically
>>> splitting up these DAGs and expressing them in a low-level enough way to
>>> affect ILP is a considerable task, though.
>>>
>>> To see if multi-core parallelism is worth it, you need to estimate how
>>> many extra L3 loads you'd incur by spreading your data of multiple L2s.  I
>>> don't have great advice for that, maybe someone else here does.  The good
>>> news is that if your problem has this level of locality, then you can
>>> probably get away with emitting C code with pthreads or even openmp.  Just
>>> bear in mind the thread creation/annihilation overhead (standing
>>> thread-pools are your friend) and pin them to cores.
>>>
>>> Good luck,
>>> Max
>>
>> --
>> You received this message because you are subscribed to the Google Groups
>> "sympy" group.
>> To unsubscribe from this group and stop receiving emails from it, send an
>> email to [email protected].
>> To post to this group, send email to [email protected].
>> Visit this group at http://groups.google.com/group/sympy.
>> To view this discussion on the web visit
>> https://groups.google.com/d/msgid/sympy/CAJ8oX-Hc2y9C7FO07kkeraDAv7NNRGPkMJ2DvjgF2Oq7PzeS6g%40mail.gmail.com.
>>
>> For more options, visit https://groups.google.com/d/optout.
>
>
> --
> You received this message because you are subscribed to the Google Groups
> "sympy" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to [email protected].
> To post to this group, send email to [email protected].
> Visit this group at http://groups.google.com/group/sympy.
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/sympy/CAP7f1AggyuHprs7H%2B_PSVP37kG%2BWQv%3DuzSsBpiExh9U4HQe0dA%40mail.gmail.com.
>
> For more options, visit https://groups.google.com/d/optout.

-- 
You received this message because you are subscribed to the Google Groups 
"sympy" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To post to this group, send email to [email protected].
Visit this group at http://groups.google.com/group/sympy.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/sympy/CADKKbthqjD8sgODmDNyFQjEXN2uTYpZv-juQ5vDn9LGm3qTerg%40mail.gmail.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to