Dear nouiz,

Thank you for your reply. I managed to run the function and the result is 
below. Do you have any suspicions? It does look as if it's allocating resources 
for intermediate values and preallocation might help, but I am unsure how 
to do this in Theano.


HostFromGpu [id A] ''   153
 |GpuElemwise{Composite{(((i0 * (maximum(i1, i2) / Composite{((i0 + i1) + 
i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))) + (i5 * 
(maximum(i3, i2) / Composite{((i0 + i1) + i2)}(maximum(i1, i2), maximum(i3, 
i2), maximum(i4, i2))))) + (i6 * (maximum(i4, i2) / Composite{((i0 + i1) + 
i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))))},no_inplace} [id 
B] ''   149
   |GpuElemwise{Composite{((i0 + i1) + i2)}}[(0, 1)] [id C] ''   145
   | |GpuElemwise{Mul}[(0, 1)] [id D] ''   122
   | | |CudaNdarrayConstant{[[ 120.]]} [id E]
   | | |GpuDimShuffle{0,x} [id F] ''   112
   | |   |GpuCAReduce{add}{0,1} [id G] ''   100
   | |     |GpuElemwise{Composite{(i0 * (i1 * i2))}}[(0, 2)] [id H] ''   90
   | |       |<CudaNdarrayType(float32, matrix)> [id I]
   | |       |<CudaNdarrayType(float32, matrix)> [id J]
   | |       |GpuReshape{2} [id K] ''   82
   | |         |GpuAlloc [id L] ''   56
   | |         | |GpuElemwise{TrueDiv}[(0, 0)] [id M] ''   47
   | |         | | |GpuElemwise{maximum,no_inplace} [id N] ''   34
   | |         | | | |GpuDimShuffle{1,0} [id O] ''   22
   | |         | | | | |<CudaNdarrayType(float32, matrix)> [id P]
   | |         | | | |CudaNdarrayConstant{[[ 0.]]} [id Q]
   | |         | | |GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} [id 
R] ''   39
   | |         | |   |GpuElemwise{maximum,no_inplace} [id N] ''   34
   | |         | |   |GpuElemwise{maximum,no_inplace} [id S] ''   32
   | |         | |   | |GpuDimShuffle{1,0} [id T] ''   18
   | |         | |   | | |<CudaNdarrayType(float32, matrix)> [id U]
   | |         | |   | |CudaNdarrayConstant{[[ 0.]]} [id Q]
   | |         | |   |GpuElemwise{maximum,no_inplace} [id V] ''   33
   | |         | |     |GpuDimShuffle{1,0} [id W] ''   19
   | |         | |     | |<CudaNdarrayType(float32, matrix)> [id X]
   | |         | |     |CudaNdarrayConstant{[[ 0.]]} [id Q]
   | |         | |Shape_i{0} [id Y] ''   3
   | |         | | |<CudaNdarrayType(float32, matrix)> [id J]
   | |         | |TensorConstant{1} [id Z]
   | |         | |Shape_i{1} [id BA] ''   24
   | |         | | |<CudaNdarrayType(float32, matrix)> [id P]
   | |         | |Shape_i{0} [id BB] ''   23
   | |         |   |<CudaNdarrayType(float32, matrix)> [id P]
   | |         |MakeVector{dtype='int64'} [id BC] ''   74
   | |           |Elemwise{Mul}[(0, 0)] [id BD] ''   64
   | |           | |Shape_i{1} [id BA] ''   24
   | |           | |Shape_i{0} [id Y] ''   3
   | |           |Shape_i{0} [id BB] ''   23
   | |GpuElemwise{Composite{(((i0 * i1 * maximum(i2, i3)) / (maximum(i2, 
i3) + maximum(i4, i3))) + ((i5 * i6 * maximum(i4, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))))},no_inplace} [id BE] ''   130
   | | |CudaNdarrayConstant{[[ 600.]]} [id BF]
   | | |GpuDimShuffle{0,x} [id BG] ''   124
   | | | |GpuCAReduce{add}{0,1} [id BH] ''   114
   | | |   |GpuElemwise{Mul}[(0, 1)] [id BI] ''   102
   | | |     |<CudaNdarrayType(float32, matrix)> [id I]
   | | |     |GpuElemwise{Mul}[(0, 1)] [id BJ] ''   89
   | | |       |<CudaNdarrayType(float32, matrix)> [id J]
   | | |       |GpuReshape{2} [id BK] ''   81
   | | |         |GpuAlloc [id BL] ''   53
   | | |         | |GpuElemwise{TrueDiv}[(0, 0)] [id BM] ''   45
   | | |         | | |GpuElemwise{maximum,no_inplace} [id S] ''   32
   | | |         | | |GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} 
[id R] ''   39
   | | |         | |Shape_i{0} [id Y] ''   3
   | | |         | |TensorConstant{1} [id Z]
   | | |         | |Shape_i{1} [id BN] ''   17
   | | |         | | |<CudaNdarrayType(float32, matrix)> [id U]
   | | |         | |Shape_i{0} [id BO] ''   16
   | | |         |   |<CudaNdarrayType(float32, matrix)> [id U]
   | | |         |MakeVector{dtype='int64'} [id BP] ''   72
   | | |           |Elemwise{Mul}[(0, 0)] [id BQ] ''   62
   | | |           | |Shape_i{1} [id BN] ''   17
   | | |           | |Shape_i{0} [id Y] ''   3
   | | |           |Shape_i{0} [id BO] ''   16
   | | |<CudaNdarrayType(float32, matrix)> [id BR]
   | | |CudaNdarrayConstant{[[ 0.]]} [id Q]
   | | |<CudaNdarrayType(float32, matrix)> [id BS]
   | | |CudaNdarrayConstant{[[-120.]]} [id BT]
   | | |GpuDimShuffle{0,x} [id BU] ''   111
   | |   |GpuCAReduce{add}{0,1} [id BV] ''   99
   | |     |GpuElemwise{Mul}[(0, 1)] [id BJ] ''   89
   | |GpuElemwise{Composite{((((i0 * i1 * maximum(i2, i3)) / Composite{((i0 
+ i1) + i2)}(maximum(i2, i3), maximum(i4, i3), maximum(i5, i3))) + (i6 * 
(maximum(i4, i3) / Composite{((i0 + i1) + i2)}(maximum(i2, i3), maximum(i4, 
i3), maximum(i5, i3))))) + ((i7 * i8 * maximum(i5, i3)) / Composite{((i0 + 
i1) + i2)}(maximum(i2, i3), maximum(i4, i3), maximum(i5, 
i3))))},no_inplace} [id BW] ''   141
   |   |CudaNdarrayConstant{[[ 360.]]} [id BX]
   |   |GpuDimShuffle{0,x} [id BY] ''   136
   |   | |GpuCAReduce{add}{0,1} [id BZ] ''   128
   |   |   |GpuElemwise{Mul}[(0, 1)] [id CA] ''   121
   |   |     |<CudaNdarrayType(float32, matrix)> [id I]
   |   |     |GpuElemwise{Mul}[(0, 1)] [id CB] ''   103
   |   |       |<CudaNdarrayType(float32, matrix)> [id J]
   |   |       |GpuReshape{2} [id CC] ''   91
   |   |         |GpuAlloc [id CD] ''   73
   |   |         | |GpuElemwise{TrueDiv}[(0, 0)] [id CE] ''   63
   |   |         | | |GpuElemwise{maximum,no_inplace} [id V] ''   33
   |   |         | | |GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} 
[id R] ''   39
   |   |         | |Shape_i{0} [id Y] ''   3
   |   |         | |TensorConstant{1} [id Z]
   |   |         | |Shape_i{1} [id CF] ''   21
   |   |         | | |<CudaNdarrayType(float32, matrix)> [id X]
   |   |         | |Shape_i{0} [id CG] ''   20
   |   |         |   |<CudaNdarrayType(float32, matrix)> [id X]
   |   |         |MakeVector{dtype='int64'} [id CH] ''   83
   |   |           |Elemwise{Mul}[(0, 0)] [id CI] ''   75
   |   |           | |Shape_i{1} [id CF] ''   21
   |   |           | |Shape_i{0} [id Y] ''   3
   |   |           |Shape_i{0} [id CG] ''   20
   |   |<CudaNdarrayType(float32, matrix)> [id CJ]
   |   |CudaNdarrayConstant{[[ 0.]]} [id Q]
   |   |<CudaNdarrayType(float32, matrix)> [id CK]
   |   |<CudaNdarrayType(float32, matrix)> [id CL]
   |   |GpuElemwise{Composite{((i0 * i1) + (i2 * i3))}}[(0, 1)] [id CM] '' 
  131
   |   | |CudaNdarrayConstant{[[ 600.]]} [id BF]
   |   | |GpuDimShuffle{0,x} [id CN] ''   126
   |   | | |GpuCAReduce{add}{0,1} [id CO] ''   116
   |   | |   |GpuElemwise{Composite{(i0 * (i1 * i2))}}[(0, 2)] [id CP] ''   
104
   |   | |     |<CudaNdarrayType(float32, matrix)> [id I]
   |   | |     |<CudaNdarrayType(float32, matrix)> [id J]
   |   | |     |GpuReshape{2} [id CQ] ''   92
   |   | |       |GpuAlloc [id CR] ''   65
   |   | |       | |GpuElemwise{Composite{((i0 * i1) / i2)}}[(0, 1)] [id 
CS] ''   54
   |   | |       | | |GpuElemwise{maximum,no_inplace} [id V] ''   33
   |   | |       | | |GpuElemwise{maximum,no_inplace} [id CT] ''   36
   |   | |       | | | |GpuDimShuffle{1,0} [id CU] ''   26
   |   | |       | | | | |<CudaNdarrayType(float32, matrix)> [id CV]
   |   | |       | | | |CudaNdarrayConstant{[[ 0.]]} [id Q]
   |   | |       | | |GpuElemwise{Composite{(i0 * (i1 + i2))},no_inplace} 
[id CW] ''   46
   |   | |       | |   |GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} 
[id R] ''   39
   |   | |       | |   |GpuElemwise{maximum,no_inplace} [id CT] ''   36
   |   | |       | |   |GpuElemwise{maximum,no_inplace} [id CX] ''   35
   |   | |       | |     |GpuDimShuffle{1,0} [id CY] ''   25
   |   | |       | |     | |<CudaNdarrayType(float32, matrix)> [id CZ]
   |   | |       | |     |CudaNdarrayConstant{[[ 0.]]} [id Q]
   |   | |       | |Shape_i{0} [id Y] ''   3
   |   | |       | |TensorConstant{1} [id Z]
   |   | |       | |Shape_i{1} [id CF] ''   21
   |   | |       | |Shape_i{0} [id CG] ''   20
   |   | |       |MakeVector{dtype='int64'} [id CH] ''   83
   |   | |CudaNdarrayConstant{[[ 360.]]} [id BX]
   |   | |GpuDimShuffle{0,x} [id DA] ''   127
   |   |   |GpuCAReduce{add}{0,1} [id DB] ''   117
   |   |     |GpuElemwise{Mul}[(0, 1)] [id DC] ''   105
   |   |       |<CudaNdarrayType(float32, matrix)> [id J]
   |   |       |GpuReshape{2} [id DD] ''   93
   |   |         |GpuAlloc [id DE] ''   66
   |   |         | |GpuElemwise{Composite{((i0 * i1) / i2)}}[(0, 1)] [id 
DF] ''   55
   |   |         | | |GpuElemwise{maximum,no_inplace} [id V] ''   33
   |   |         | | |GpuElemwise{maximum,no_inplace} [id CX] ''   35
   |   |         | | |GpuElemwise{Composite{(i0 * (i1 + i2))},no_inplace} 
[id CW] ''   46
   |   |         | |Shape_i{0} [id Y] ''   3
   |   |         | |TensorConstant{1} [id Z]
   |   |         | |Shape_i{1} [id CF] ''   21
   |   |         | |Shape_i{0} [id CG] ''   20
   |   |         |MakeVector{dtype='int64'} [id CH] ''   83
   |   |CudaNdarrayConstant{[[-120.]]} [id BT]
   |   |GpuDimShuffle{0,x} [id DG] ''   125
   |     |GpuCAReduce{add}{0,1} [id DH] ''   115
   |       |GpuElemwise{Mul}[(0, 1)] [id CB] ''   103
   |<CudaNdarrayType(float32, matrix)> [id DI]
   |CudaNdarrayConstant{[[ 0.]]} [id Q]
   |<CudaNdarrayType(float32, matrix)> [id DJ]
   |<CudaNdarrayType(float32, matrix)> [id DK]
   |GpuElemwise{Composite{((i0 * i1) + (i2 * i3))}}[(0, 1)] [id DL] ''   120
   | |CudaNdarrayConstant{[[ 600.]]} [id BF]
   | |GpuDimShuffle{0,x} [id DM] ''   109
   | | |GpuCAReduce{add}{0,1} [id DN] ''   97
   | |   |GpuElemwise{Composite{(i0 * (i1 * i2))}}[(0, 2)] [id DO] ''   87
   | |     |<CudaNdarrayType(float32, matrix)> [id I]
   | |     |<CudaNdarrayType(float32, matrix)> [id J]
   | |     |GpuReshape{2} [id DP] ''   79
   | |       |GpuAlloc [id DQ] ''   51
   | |       | |GpuElemwise{TrueDiv}[(0, 0)] [id DR] ''   43
   | |       | | |GpuElemwise{maximum,no_inplace} [id DS] ''   30
   | |       | | | |GpuDimShuffle{1,0} [id DT] ''   12
   | |       | | | | |<CudaNdarrayType(float32, matrix)> [id DU]
   | |       | | | |CudaNdarrayConstant{[[ 0.]]} [id Q]
   | |       | | |GpuElemwise{add,no_inplace} [id DV] ''   38
   | |       | |   |GpuElemwise{maximum,no_inplace} [id DS] ''   30
   | |       | |   |GpuElemwise{maximum,no_inplace} [id DW] ''   31
   | |       | |     |GpuDimShuffle{1,0} [id DX] ''   13
   | |       | |     | |<CudaNdarrayType(float32, matrix)> [id DY]
   | |       | |     |CudaNdarrayConstant{[[ 0.]]} [id Q]
   | |       | |Shape_i{0} [id Y] ''   3
   | |       | |TensorConstant{1} [id Z]
   | |       | |Shape_i{1} [id DZ] ''   11
   | |       | | |<CudaNdarrayType(float32, matrix)> [id DU]
   | |       | |Shape_i{0} [id EA] ''   10
   | |       |   |<CudaNdarrayType(float32, matrix)> [id DU]
   | |       |MakeVector{dtype='int64'} [id EB] ''   70
   | |         |Elemwise{Mul}[(0, 0)] [id EC] ''   60
   | |         | |Shape_i{1} [id DZ] ''   11
   | |         | |Shape_i{0} [id Y] ''   3
   | |         |Shape_i{0} [id EA] ''   10
   | |CudaNdarrayConstant{[[ 120.]]} [id E]
   | |GpuDimShuffle{0,x} [id ED] ''   110
   |   |GpuCAReduce{add}{0,1} [id EE] ''   98
   |     |GpuElemwise{Mul}[(0, 1)] [id EF] ''   88
   |       |<CudaNdarrayType(float32, matrix)> [id J]
   |       |GpuReshape{2} [id EG] ''   80
   |         |GpuAlloc [id EH] ''   52
   |         | |GpuElemwise{TrueDiv}[(0, 0)] [id EI] ''   44
   |         | | |GpuElemwise{maximum,no_inplace} [id DW] ''   31
   |         | | |GpuElemwise{add,no_inplace} [id DV] ''   38
   |         | |Shape_i{0} [id Y] ''   3
   |         | |TensorConstant{1} [id Z]
   |         | |Shape_i{1} [id EJ] ''   15
   |         | | |<CudaNdarrayType(float32, matrix)> [id DY]
   |         | |Shape_i{0} [id EK] ''   14
   |         |   |<CudaNdarrayType(float32, matrix)> [id DY]
   |         |MakeVector{dtype='int64'} [id EL] ''   71
   |           |Elemwise{Mul}[(0, 0)] [id EM] ''   61
   |           | |Shape_i{1} [id EJ] ''   15
   |           | |Shape_i{0} [id Y] ''   3
   |           |Shape_i{0} [id EK] ''   14
   |GpuElemwise{Composite{((i0 + i1) + i2)}}[(0, 1)] [id EN] ''   142
     |GpuElemwise{Mul}[(0, 1)] [id EO] ''   119
     | |CudaNdarrayConstant{[[ 360.]]} [id BX]
     | |GpuDimShuffle{0,x} [id EP] ''   108
     |   |GpuCAReduce{add}{0,1} [id EQ] ''   96
     |     |GpuElemwise{Composite{(i0 * (i1 * i2))}}[(0, 2)] [id ER] ''   86
     |       |<CudaNdarrayType(float32, matrix)> [id I]
     |       |<CudaNdarrayType(float32, matrix)> [id J]
     |       |GpuReshape{2} [id ES] ''   78
     |         |GpuAlloc [id ET] ''   50
     |         | |GpuElemwise{TrueDiv}[(0, 0)] [id EU] ''   42
     |         | | |GpuElemwise{maximum,no_inplace} [id EV] ''   29
     |         | | | |GpuDimShuffle{1,0} [id EW] ''   7
     |         | | | | |<CudaNdarrayType(float32, matrix)> [id EX]
     |         | | | |CudaNdarrayConstant{[[ 0.]]} [id Q]
     |         | | |GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} [id 
EY] ''   37
     |         | |   |GpuElemwise{maximum,no_inplace} [id EV] ''   29
     |         | |   |GpuElemwise{maximum,no_inplace} [id EZ] ''   27
     |         | |   | |GpuDimShuffle{1,0} [id FA] ''   2
     |         | |   | | |<CudaNdarrayType(float32, matrix)> [id FB]
     |         | |   | |CudaNdarrayConstant{[[ 0.]]} [id Q]
     |         | |   |GpuElemwise{maximum,no_inplace} [id FC] ''   28
     |         | |     |GpuDimShuffle{1,0} [id FD] ''   4
     |         | |     | |<CudaNdarrayType(float32, matrix)> [id FE]
     |         | |     |CudaNdarrayConstant{[[ 0.]]} [id Q]
     |         | |Shape_i{0} [id Y] ''   3
     |         | |TensorConstant{1} [id Z]
     |         | |Shape_i{1} [id FF] ''   9
     |         | | |<CudaNdarrayType(float32, matrix)> [id EX]
     |         | |Shape_i{0} [id FG] ''   8
     |         |   |<CudaNdarrayType(float32, matrix)> [id EX]
     |         |MakeVector{dtype='int64'} [id FH] ''   69
     |           |Elemwise{Mul}[(0, 0)] [id FI] ''   59
     |           | |Shape_i{1} [id FF] ''   9
     |           | |Shape_i{0} [id Y] ''   3
     |           |Shape_i{0} [id FG] ''   8
     |GpuElemwise{Composite{(((i0 * i1 * maximum(i2, i3)) / (maximum(i2, 
i3) + maximum(i4, i3))) + ((i5 * i6 * maximum(i4, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))))},no_inplace} [id FJ] ''   129
     | |CudaNdarrayConstant{[[ 600.]]} [id BF]
     | |GpuDimShuffle{0,x} [id FK] ''   123
     | | |GpuCAReduce{add}{0,1} [id FL] ''   113
     | |   |GpuElemwise{Mul}[(0, 1)] [id FM] ''   101
     | |     |<CudaNdarrayType(float32, matrix)> [id I]
     | |     |GpuElemwise{Mul}[(0, 1)] [id FN] ''   84
     | |       |<CudaNdarrayType(float32, matrix)> [id J]
     | |       |GpuReshape{2} [id FO] ''   76
     | |         |GpuAlloc [id FP] ''   48
     | |         | |GpuElemwise{TrueDiv}[(0, 0)] [id FQ] ''   40
     | |         | | |GpuElemwise{maximum,no_inplace} [id EZ] ''   27
     | |         | | |GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} 
[id EY] ''   37
     | |         | |Shape_i{0} [id Y] ''   3
     | |         | |TensorConstant{1} [id Z]
     | |         | |Shape_i{1} [id FR] ''   1
     | |         | | |<CudaNdarrayType(float32, matrix)> [id FB]
     | |         | |Shape_i{0} [id FS] ''   0
     | |         |   |<CudaNdarrayType(float32, matrix)> [id FB]
     | |         |MakeVector{dtype='int64'} [id FT] ''   67
     | |           |Elemwise{Mul}[(0, 0)] [id FU] ''   57
     | |           | |Shape_i{1} [id FR] ''   1
     | |           | |Shape_i{0} [id Y] ''   3
     | |           |Shape_i{0} [id FS] ''   0
     | |<CudaNdarrayType(float32, matrix)> [id FV]
     | |CudaNdarrayConstant{[[ 0.]]} [id Q]
     | |<CudaNdarrayType(float32, matrix)> [id FW]
     | |CudaNdarrayConstant{[[-360.]]} [id FX]
     | |GpuDimShuffle{0,x} [id FY] ''   106
     |   |GpuCAReduce{add}{0,1} [id FZ] ''   94
     |     |GpuElemwise{Mul}[(0, 1)] [id FN] ''   84
     |GpuElemwise{Mul}[(0, 1)] [id GA] ''   118
       |CudaNdarrayConstant{[[ 120.]]} [id E]
       |GpuDimShuffle{0,x} [id GB] ''   107
         |GpuCAReduce{add}{0,1} [id GC] ''   95
           |GpuElemwise{Mul}[(0, 1)] [id GD] ''   85
             |<CudaNdarrayType(float32, matrix)> [id J]
             |GpuReshape{2} [id GE] ''   77
               |GpuAlloc [id GF] ''   49
               | |GpuElemwise{TrueDiv}[(0, 0)] [id GG] ''   41
               | | |GpuElemwise{maximum,no_inplace} [id FC] ''   28
               | | |GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} [id 
EY] ''   37
               | |Shape_i{0} [id Y] ''   3
               | |TensorConstant{1} [id Z]
               | |Shape_i{1} [id GH] ''   6
               | | |<CudaNdarrayType(float32, matrix)> [id FE]
               | |Shape_i{0} [id GI] ''   5
               |   |<CudaNdarrayType(float32, matrix)> [id FE]
               |MakeVector{dtype='int64'} [id GJ] ''   68
                 |Elemwise{Mul}[(0, 0)] [id GK] ''   58
                 | |Shape_i{1} [id GH] ''   6
                 | |Shape_i{0} [id Y] ''   3
                 |Shape_i{0} [id GI] ''   5
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GL] ''   
139
 |<CudaNdarrayType(float32, matrix)> [id BR]
 |GpuElemwise{Mul}[(0, 1)] [id GM] ''   134
 | |CudaNdarrayConstant{[[ 600.]]} [id BF]
 | |GpuDimShuffle{0,x} [id BG] ''   124
 |GpuElemwise{Composite{(((i0 * i1 * maximum(i2, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))) + ((i5 * i6 * maximum(i4, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))))},no_inplace} [id BE] ''   130
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GN] ''   
140
 |<CudaNdarrayType(float32, matrix)> [id BS]
 |GpuElemwise{Mul}[(0, 1)] [id GO] ''   135
 | |CudaNdarrayConstant{[[-120.]]} [id BT]
 | |GpuDimShuffle{0,x} [id BU] ''   111
 |GpuElemwise{Composite{(((i0 * i1 * maximum(i2, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))) + ((i5 * i6 * maximum(i4, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))))},no_inplace} [id BE] ''   130
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GP] ''   
148
 |<CudaNdarrayType(float32, matrix)> [id CJ]
 |GpuElemwise{Mul}[(0, 1)] [id GQ] ''   144
 | |CudaNdarrayConstant{[[ 360.]]} [id BX]
 | |GpuDimShuffle{0,x} [id BY] ''   136
 |GpuElemwise{Composite{((((i0 * i1 * maximum(i2, i3)) / Composite{((i0 + 
i1) + i2)}(maximum(i2, i3), maximum(i4, i3), maximum(i5, i3))) + (i6 * 
(maximum(i4, i3) / Composite{((i0 + i1) + i2)}(maximum(i2, i3), maximum(i4, 
i3), maximum(i5, i3))))) + ((i7 * i8 * maximum(i5, i3)) / Composite{((i0 + 
i1) + i2)}(maximum(i2, i3), maximum(i4, i3), maximum(i5, 
i3))))},no_inplace} [id BW] ''   141
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GR] ''   
146
 |<CudaNdarrayType(float32, matrix)> [id CK]
 |GpuElemwise{Composite{((i0 * i1) + (i2 * i3))}}[(0, 1)] [id CM] ''   131
 |GpuElemwise{Composite{((((i0 * i1 * maximum(i2, i3)) / Composite{((i0 + 
i1) + i2)}(maximum(i2, i3), maximum(i4, i3), maximum(i5, i3))) + (i6 * 
(maximum(i4, i3) / Composite{((i0 + i1) + i2)}(maximum(i2, i3), maximum(i4, 
i3), maximum(i5, i3))))) + ((i7 * i8 * maximum(i5, i3)) / Composite{((i0 + 
i1) + i2)}(maximum(i2, i3), maximum(i4, i3), maximum(i5, 
i3))))},no_inplace} [id BW] ''   141
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GS] ''   
147
 |<CudaNdarrayType(float32, matrix)> [id CL]
 |GpuElemwise{Mul}[(0, 1)] [id GT] ''   143
 | |CudaNdarrayConstant{[[-120.]]} [id BT]
 | |GpuDimShuffle{0,x} [id DG] ''   125
 |GpuElemwise{Composite{((((i0 * i1 * maximum(i2, i3)) / Composite{((i0 + 
i1) + i2)}(maximum(i2, i3), maximum(i4, i3), maximum(i5, i3))) + (i6 * 
(maximum(i4, i3) / Composite{((i0 + i1) + i2)}(maximum(i2, i3), maximum(i4, 
i3), maximum(i5, i3))))) + ((i7 * i8 * maximum(i5, i3)) / Composite{((i0 + 
i1) + i2)}(maximum(i2, i3), maximum(i4, i3), maximum(i5, 
i3))))},no_inplace} [id BW] ''   141
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GU] ''   
152
 |<CudaNdarrayType(float32, matrix)> [id DI]
 |GpuElemwise{Composite{((i0 + i1) + i2)}}[(0, 1)] [id C] ''   145
 |GpuElemwise{Composite{(((i0 * (maximum(i1, i2) / Composite{((i0 + i1) + 
i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))) + (i5 * 
(maximum(i3, i2) / Composite{((i0 + i1) + i2)}(maximum(i1, i2), maximum(i3, 
i2), maximum(i4, i2))))) + (i6 * (maximum(i4, i2) / Composite{((i0 + i1) + 
i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))))},no_inplace} [id 
B] ''   149
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GV] ''   
151
 |<CudaNdarrayType(float32, matrix)> [id DJ]
 |GpuElemwise{Composite{((i0 * i1) + (i2 * i3))}}[(0, 1)] [id DL] ''   120
 |GpuElemwise{Composite{(((i0 * (maximum(i1, i2) / Composite{((i0 + i1) + 
i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))) + (i5 * 
(maximum(i3, i2) / Composite{((i0 + i1) + i2)}(maximum(i1, i2), maximum(i3, 
i2), maximum(i4, i2))))) + (i6 * (maximum(i4, i2) / Composite{((i0 + i1) + 
i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))))},no_inplace} [id 
B] ''   149
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GW] ''   
150
 |<CudaNdarrayType(float32, matrix)> [id DK]
Function profiling
 |GpuElemwise{Composite{((i0 + i1) + i2)}}[(0, 1)] [id EN] ''   142
 |GpuElemwise{Composite{(((i0 * (maximum(i1, i2) / Composite{((i0 + i1) + 
i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))) + (i5 * 
(maximum(i3, i2) / Composite{((i0 + i1) + i2)}(maximum(i1, i2), maximum(i3, 
i2), maximum(i4, i2))))) + (i6 * (maximum(i4, i2) / Composite{((i0 + i1) + 
i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))))},no_inplace} [id 
B] ''   149
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GX] ''   
138
 |<CudaNdarrayType(float32, matrix)> [id FV]
 |GpuElemwise{Mul}[(0, 1)] [id GY] ''   133
 | |CudaNdarrayConstant{[[ 600.]]} [id BF]
 | |GpuDimShuffle{0,x} [id FK] ''   123
 |GpuElemwise{Composite{(((i0 * i1 * maximum(i2, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))) + ((i5 * i6 * maximum(i4, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))))},no_inplace} [id FJ] ''   129
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)] [id GZ] ''   
137
 |<CudaNdarrayType(float32, matrix)> [id FW]
 |GpuElemwise{Mul}[(0, 1)] [id HA] ''   132
 | |CudaNdarrayConstant{[[-360.]]} [id FX]
 | |GpuDimShuffle{0,x} [id FY] ''   106
 |GpuElemwise{Composite{(((i0 * i1 * maximum(i2, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))) + ((i5 * i6 * maximum(i4, i3)) / (maximum(i2, i3) + 
maximum(i4, i3))))},no_inplace} [id FJ] ''   129
 |CudaNdarrayConstant{[[ 0.]]} [id Q]
None



On Tuesday, 7 February 2017 23:52:17 UTC+2, nouiz wrote:
>
> There is a high quantity of GpuAlloc. What you have shown doesn't tell us 
> what needs it in Theano. Can you run the theano function with profiling, and 
> before the script ends call theano.debugprint(your_theano_function) and send 
> this output? It will tell us what needs it in the graph.
>
> On Fri, Feb 3, 2017 at 4:22 AM Šarūnas S. <[email protected] <javascript:>> 
> wrote:
>
>> I wrote a script in theano and started profiling it. What I noticed is 
>> that the GPU spends most of the time in GpuAlloc. 
>>
>> Could somebody explain to me why this is happening and how I could reduce it?
>> In C or C++ I would preallocate it, but I am not sure how to do this in 
>> theano.   
>>
>> I am running on Windows 8.1 with Nvidia GTX 1070 with Theano 
>> @ 0.9.0dev4.dev-3c0be3d94102ac6864b2e5ab52ae96d07c6375c6 
>>
>>
>> I am attaching extensive profile result below:
>>
>> Function profiling
>> ==================
>>   Message: Sum of all(2) printed profiles at exit excluding Scan op 
>> profile.
>>   Time in 200 calls to Function.__call__: 3.463001e+00s
>>   Time in Function.fn.__call__: 3.451001e+00s (99.653%)
>>   Time in thunks: 3.425293e+00s (98.911%)
>>   Total compile time: 1.413800e+01s
>>     Number of Apply nodes: 590
>>     Theano Optimizer time: 1.158200e+01s
>>        Theano validate time: 9.390018e-01s
>>     Theano Linker time (includes C, CUDA code generation/compiling): 
>> 2.107000e+00s
>>        Import time 3.500128e-02s
>>        Node make_thunk time 2.042000e+00s
>>            Node GpuCAReduce{add}{0,1}(GpuElemwise{Composite{(i0 * (i1 * 
>> i2))}}[(0, 2)].0) time 9.000063e-03s
>>            Node GpuCAReduce{add}{0,1}(GpuElemwise{Mul}[(0, 1)].0) time 
>> 7.999897e-03s
>>            Node GpuDimShuffle{0,x}(GpuCAReduce{add}{0,1}.0) time 
>> 6.999969e-03s
>>            Node Shape_i{1}(<CudaNdarrayType(float32, matrix)>) time 
>> 4.999876e-03s
>>            Node GpuElemwise{Mul}[(0, 1)](CudaNdarrayConstant{[[ 240.]]}, 
>> GpuDimShuffle{0,x}.0) time 4.999876e-03s
>>
>>
>> Time in all call to theano.grad() 0.000000e+00s
>> Time since theano import 41.580s
>> Class
>> ---
>> <% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> 
>> <Class name>
>>   90.5%    90.5%       3.100s       3.37e-04s     C     9200      92   
>> theano.sandbox.cuda.basic_ops.GpuAlloc
>>    7.4%    97.9%       0.254s       4.19e-06s     C    60600     606   
>> theano.sandbox.cuda.basic_ops.GpuElemwise
>>    1.0%    98.9%       0.034s       2.77e-06s     C    12200     122   
>> theano.sandbox.cuda.basic_ops.GpuCAReduce
>>    0.5%    99.4%       0.017s       1.84e-06s     C     9200      92   
>> theano.sandbox.cuda.basic_ops.GpuReshape
>>    0.5%    99.9%       0.016s       7.45e-07s     C    21400     214   
>> theano.sandbox.cuda.basic_ops.GpuDimShuffle
>>    0.1%    99.9%       0.003s       1.57e-06s     C     1900      19   
>> theano.tensor.elemwise.Elemwise
>>    0.1%   100.0%       0.002s       5.24e-07s     C     3800      38   
>> theano.compile.ops.Shape_i
>>    0.0%   100.0%       0.000s       0.00e+00s     C     1900      19   
>> theano.tensor.opt.MakeVector
>>    ... (remaining 0 Classes account for   0.00%(0.00s) of the runtime)
>>
>>
>> Ops
>> ---
>> <% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> 
>> <Op name>
>>   90.5%    90.5%       3.100s       3.37e-04s     C     9200       92   
>> GpuAlloc
>>    1.7%    92.2%       0.058s       4.41e-06s     C     13100      131   
>> GpuElemwise{Mul}[(0, 1)]
>>    1.0%    93.2%       0.034s       3.21e-06s     C     10600      106   
>> GpuElemwise{maximum,no_inplace}
>>    1.0%    94.2%       0.034s       2.77e-06s     C     12200      122   
>> GpuCAReduce{add}{0,1}
>>    0.7%    94.8%       0.023s       3.54e-06s     C     6500       65   
>> GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)}}[(0, 0)]
>>    0.5%    95.4%       0.018s       3.27e-06s     C     5500       55   
>> GpuElemwise{mul,no_inplace}
>>    0.5%    95.9%       0.018s       4.61e-06s     C     3900       39   
>> GpuElemwise{Composite{((i0 * i1) / i2)}}[(0, 1)]
>>    0.5%    96.4%       0.017s       1.84e-06s     C     9200       92   
>> GpuReshape{2}
>>    0.4%    96.8%       0.014s       4.33e-06s     C     3200       32   
>> GpuElemwise{Composite{(i0 * (i1 * i2))}}[(0, 2)]
>>    0.2%    97.0%       0.008s       8.69e-07s     C     9200       92   
>> GpuDimShuffle{1,0}
>>    0.2%    97.3%       0.008s       5.33e-06s     C     1500       15   
>> GpuElemwise{Composite{((i0 * i1) / i2)},no_inplace}
>>    0.2%    97.5%       0.008s       6.52e-07s     C     12200      122   
>> GpuDimShuffle{0,x}
>>    0.2%    97.7%       0.007s       4.38e-06s     C     1600       16   
>> GpuElemwise{Composite{(((i0 * i1 * maximum(i2, i3)) / (maximum(i2, i3) + 
>> maximum(i4, i3))) + ((i5 * i6 * maximum(i4, i3)) / (maximum(i2, i3) + 
>> maximum(i4, i3))))},no_inplace}
>>    0.2%    97.9%       0.007s       2.92e-06s     C     2400       24   
>> GpuElemwise{Composite{maximum(((i0 + i1) - i2), i3)},no_inplace}
>>    0.2%    98.1%       0.007s       8.75e-06s     C      800        8   
>> GpuElemwise{Composite{((i0 * i1 * i2) / i3)}}[(0, 2)]
>>    0.2%    98.3%       0.007s       8.73e-06s     C      800        8   
>> GpuElemwise{Composite{((i0 * i1) / i2)}}[(0, 0)]
>>    0.2%    98.5%       0.006s       3.54e-06s     C     1700       17   
>> GpuElemwise{true_div,no_inplace}
>>    0.1%    98.6%       0.005s       5.02e-06s     C     1000       10   
>> GpuElemwise{Composite{(i0 * (i1 + i2))},no_inplace}
>>    0.1%    98.8%       0.005s       9.99e-06s     C      500        5   
>> GpuElemwise{Composite{(((i0 + i1) + i2) + i3)},no_inplace}
>>    0.1%    98.9%       0.004s       6.65e-06s     C      600        6   
>> GpuElemwise{Composite{(((i0 * (maximum(i1, i2) / Composite{((i0 + i1) + 
>> i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2)))) + ((i5 * i6 * 
>> maximum(i3, i2)) / Composite{((i0 + i1) + i2)}(maximum(i1, i2), maximum(
>> i3, i2), maximum(i4, i2)))) + ((i7 * i8 * maximum(i4, i2)) / Composite{((i0 
>> + i1) + i2)}(maximum(i1, i2), maximum(i3, i2), maximum(i4, i2))))},
>> no_inplace}
>>    ... (remaining 33 Ops account for   1.11%(0.04s) of the runtime)
>>
>>
>> Apply
>> ------
>> <% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
>>    1.6%     1.6%       0.055s       5.50e-04s    100   188   GpuAlloc(
>> GpuElemwise{Composite{((i0 * i1) / i2)},no_inplace}.0, TensorConstant{
>> 1326}, TensorConstant{1}, Shape_i{1}.0, Shape_i{0}.0)
>>    1.6%     3.2%       0.055s       5.50e-04s    100   217   GpuAlloc(
>> GpuElemwise{Composite{((i0 * i1) / i2)}}[(0, 1)].0, TensorConstant{1326}, 
>> TensorConstant{1}, Shape_i{1}.0, Shape_i{0}.0)
>>    1.6%     4.8%       0.055s       5.50e-04s    100   224   GpuAlloc(
>> GpuElemwise{Composite{((i0 * i1 * i2) / i3)}}[(0, 2)].0, TensorConstant{
>> 1326}, TensorConstant{1}, Shape_i{1}.0, Shape_i{0}.0)
>>    1.6%     6.4%       0.055s       5.50e-04s    100   183   GpuAlloc(
>> GpuElemwise{Composite{((i0 * i1) / i2)}}[(0, 1)].0, TensorConstant{1326}, 
>> TensorConstant{1}, Shape_i{1}.0, Shape_i{0}.0)
>>    1.6%     8.0%       0.054s       5.39e-04s    100   186   GpuAlloc(
>> GpuElemwise{Composite{((i0 * i1) / i2)},no_inplace}.0, TensorConstant{
>> 1326}, TensorConstant{1}, Shape_i{1}.0, Shape_i{0}.0)
>>    1.5%     9.5%       0.053s       5.30e-04s    100   154   GpuAlloc(
>> GpuElemwise{true_div
>>
>

-- 

--- 
You received this message because you are subscribed to the Google Groups 
"theano-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to