I suddenly get the HEAD version of libgpuarray works I found that if I increase the size of the matrix, the error will appear. The first few rows of the matrix are correct, and then there will be errors for the remaining rows. I guess there is a synchronization or memory bug.
$ python3 cho.py row #0: err=0 (max=0) row #1: err=0 (max=0) row #2: err=0 (max=0) row #3: err=1.77636e-15 (max=1.77636e-15) row #4: err=0 (max=0) row #5: err=1.77982e-15 (max=1.77636e-15) row #6: err=1.14439e-16 (max=1.11022e-16) row #7: err=6.245e-17 (max=5.55112e-17) row #8: err=1.79104e-15 (max=1.77636e-15) row #9: err=1.84778e-15 (max=1.77636e-15) row #10: err=1.83628e-15 (max=1.77636e-15) row #11: err=7.13054e-16 (max=6.66134e-16) row #12: err=8.55484e-17 (max=8.32667e-17) row #13: err=7.19641e-16 (max=4.44089e-16) row #14: err=2.30555e-16 (max=1.11022e-16) row #15: err=1.93574e-15 (max=1.77636e-15) row #16: err=3.61888e-16 (max=2.22045e-16) row #17: err=1.94548e-15 (max=1.77636e-15) row #18: err=1.81003e-15 (max=1.77636e-15) row #19: err=1.85793e-15 (max=1.77636e-15) row #20: err=1.93489e-15 (max=1.77636e-15) row #21: err=2.10577e-15 (max=1.77636e-15) row #22: err=9.14588e-16 (max=4.44089e-16) row #23: err=7.63657e-16 (max=4.44089e-16) row #24: err=1.42114e-15 (max=8.88178e-16) row #25: err=3.80154e-15 (max=3.55271e-15) row #26: err=3.66222e-15 (max=3.55271e-15) row #27: err=1.06328e-15 (max=8.88178e-16) row #28: err=2.31959e-15 (max=1.77636e-15) row #29: err=3.65102e-15 (max=3.55271e-15) row #30: err=9.84652e-16 (max=4.44089e-16) row #31: err=1.98222e-15 (max=1.33227e-15) row #32: err=1.69428e-15 (max=8.88178e-16) row #33: err=2.39616e-15 (max=1.77636e-15) row #34: err=1.29213e-15 (max=8.88178e-16) row #35: err=1.04169e-15 (max=4.44089e-16) row #36: err=2.56552e-15 (max=1.77636e-15) row #37: err=1.92892e-15 (max=8.88178e-16) row #38: err=2.20448e-15 (max=1.77636e-15) row #39: err=1.49001e-15 (max=6.66134e-16) row #40: err=1.17059e-15 (max=5.55112e-16) row #41: err=1.77533e-15 (max=8.88178e-16) row #42: err=2.27739e-15 (max=1.77636e-15) row #43: err=1.47627e-15 (max=6.66134e-16) row #44: err=2.09264e-15 (max=1.33227e-15) row #45: err=1.81502e-15 (max=8.88178e-16) row #46: err=1.84387e-15 (max=8.88178e-16) row #47: err=1.06552e-15 (max=4.44089e-16) row #48: err=2.76471e-15 (max=1.77636e-15) row #49: err=2.18163e-15 (max=1.33227e-15) row #50: err=3.22704e-15 (max=1.77636e-15) row #51: err=3.64846e-15 (max=1.77636e-15) row #52: err=1.66905e-15 (max=6.66134e-16) row #53: err=1.81576e-15 (max=1.11022e-15) row #54: err=2.41371e-15 (max=1.77636e-15) row #55: err=3.9903e-15 (max=3.55271e-15) row #56: err=3.00212e-15 (max=1.77636e-15) row #57: err=3.06269e-15 (max=1.77636e-15) row #58: err=2.50664e-15 (max=1.77636e-15) row #59: err=3.85325e-15 (max=3.55271e-15) row #60: err=3.55556e-15 (max=1.77636e-15) row #61: err=2.1962e-15 (max=8.88178e-16) row #62: err=3.49413e-15 (max=1.77636e-15) row #63: err=3.29766e-15 (max=1.77636e-15) row #64: err=2.4585e-15 (max=1.33227e-15) row #65: err=2.12112e-15 (max=8.88178e-16) row #66: err=3.71809e-15 (max=1.77636e-15) row #67: err=2.7659e-15 (max=8.88178e-16) row #68: err=3.32757e-15 (max=1.77636e-15) row #69: err=2.41245e-15 (max=8.60423e-16) row #70: err=3.99688e-15 (max=1.9984e-15) row #71: err=2.52257e-15 (max=8.88178e-16) row #72: err=3.55973e-15 (max=1.55431e-15) row #73: err=2.7763e-15 (max=8.88178e-16) row #74: err=4.40704e-15 (max=3.55271e-15) row #75: err=3.55809e-15 (max=1.77636e-15) row #76: err=3.04663e-15 (max=1.77636e-15) row #77: err=2.85651e-15 (max=1.11022e-15) row #78: err=4.05814e-15 (max=1.77636e-15) row #79: err=3.33612e-15 (max=1.32533e-15) row #80: err=3.20748e-15 (max=1.77636e-15) row #81: err=3.8984e-15 (max=1.77636e-15) row #82: err=3.5669e-15 (max=1.22125e-15) row #83: err=4.28332e-15 (max=2.22045e-15) row #84: err=3.64221e-15 (max=1.33227e-15) row #85: err=4.83762e-15 (max=3.55271e-15) row #86: err=4.0986e-15 (max=1.77636e-15) row #87: err=3.60163e-15 (max=1.77636e-15) row #88: err=5.06272e-15 (max=3.55271e-15) row #89: err=3.68688e-15 (max=1.77636e-15) row #90: err=7.07646e-15 (max=5.32907e-15) row #91: err=3.83584e-15 (max=1.05471e-15) row #92: err=4.50821e-15 (max=1.77636e-15) row #93: err=5.47632e-15 (max=1.77636e-15) row #94: err=4.46046e-15 (max=1.44329e-15) row #95: err=5.61405e-15 (max=3.55271e-15) row #96: err=5.06176e-15 (max=2.22045e-15) row #97: err=3.81964e-15 (max=1.55431e-15) row #98: err=4.37526e-15 (max=1.77636e-15) row #99: err=3.98392e-15 (max=1.55431e-15) row #100: err=4.91222e-15 (max=1.77636e-15) row #101: err=3.35853e-15 (max=1.22125e-15) row #102: err=4.78829e-15 (max=2.22045e-15) row #103: err=4.60413e-15 (max=1.33227e-15) row #104: err=4.5791e-15 (max=1.38778e-15) row #105: err=5.45668e-15 (max=1.9984e-15) row #106: err=7.5096e-15 (max=3.55271e-15) row #107: err=4.63925e-15 (max=1.33227e-15) row #108: err=5.44862e-15 (max=2.44249e-15) row #109: err=4.83685e-15 (max=2.22045e-15) row #110: err=4.11954e-15 (max=1.55431e-15) row #111: err=5.48967e-15 (max=1.9984e-15) row #112: err=4.78231e-15 (max=1.77636e-15) row #113: err=6.65255e-15 (max=2.22045e-15) row #114: err=6.33143e-15 (max=3.55271e-15) row #115: err=7.17902e-15 (max=3.21965e-15) row #116: err=6.00826e-15 (max=1.83187e-15) row #117: err=6.52156e-15 (max=2.22045e-15) row #118: err=4.56739e-15 (max=1.55431e-15) row #119: err=5.78508e-15 (max=2.22045e-15) row #120: err=6.4643e-15 (max=2.08167e-15) row #121: err=4.31762e-15 (max=1.33227e-15) row #122: err=7.30575e-15 (max=3.55271e-15) row #123: err=5.16371e-15 (max=1.55431e-15) row #124: err=6.8954e-15 (max=2.66454e-15) row #125: err=6.68844e-15 (max=1.9984e-15) row #126: err=6.36886e-15 (max=2.10942e-15) row #127: err=8.18275e-15 (max=3.10862e-15) row #128: err=7.58721e-15 (max=2.9976e-15) row #129: err=8.76019e-15 (max=2.44249e-15) row #130: err=8.60251e-15 (max=4.16334e-15) row #131: err=7.45057e-15 (max=1.88738e-15) row #132: err=7.273e-15 (max=1.9984e-15) row #133: err=8.46628e-15 (max=2.44249e-15) row #134: err=6.03992e-15 (max=1.9984e-15) row #135: err=8.54499e-15 (max=3.55271e-15) row #136: err=7.33755e-15 (max=3.10862e-15) row #137: err=1.32453e-14 (max=7.10543e-15) row #138: err=9.21473e-15 (max=2.88658e-15) row #139: err=1.38584e-14 (max=8.21565e-15) row #140: err=9.92134e-15 (max=4.77396e-15) row #141: err=8.12191e-15 (max=3.55271e-15) row #142: err=8.54742e-15 (max=3.05311e-15) row #143: err=1.1525e-14 (max=3.9968e-15) row #144: err=9.56483e-15 (max=3.55271e-15) row #145: err=7.57599e-15 (max=2.16493e-15) row #146: err=9.08358e-15 (max=3.77476e-15) row #147: err=1.261e-14 (max=4.16334e-15) row #148: err=1.04084e-14 (max=3.88578e-15) row #149: err=1.52547e-14 (max=6.21725e-15) row #150: err=1.34445e-14 (max=6.21725e-15) row #151: err=1.28415e-14 (max=5.32907e-15) row #152: err=1.37001e-14 (max=4.88498e-15) row #153: err=104.091 (max=38.2524) row #154: err=90.9855 (max=28.555) row #155: err=114.057 (max=35.5966) row #156: err=90.1876 (max=34.3175) row #157: err=114.274 (max=41.0308) row #158: err=68.7615 (max=29.8493) row #159: err=102.592 (max=45.7777) row #160: err=88.559 (max=39.3841) row #161: err=102.897 (max=37.4962) row #162: err=89.7443 (max=39.1052) row #163: err=91.8647 (max=40.6695) row #164: err=92.5436 (max=39.478) row #165: err=67.0603 (max=22.3479) row #166: err=97.741 (max=35.374) row #167: err=88.4444 (max=33.1283) row #168: err=66.4308 (max=29.6943) row #169: err=76.6372 (max=40.7606) row #170: err=68.7239 (max=28.0245) row #171: err=91.2993 (max=48.1353) row #172: err=94.0889 (max=48.0026) row #173: err=76.6705 (max=33.9253) row #174: err=78.756 (max=39.5833) row #175: err=51.6685 (max=29.4995) row #176: err=74.8719 (max=28.4035) row #177: err=82.6127 (max=35.2276) row #178: err=43.8165 (max=20.9576) row #179: err=67.3553 (max=27.4942) row #180: err=74.5054 (max=39.5853) row #181: err=52.8585 (max=29.805) row #182: err=54.6962 (max=22.4845) row #183: err=49.1812 (max=26.9341) row #184: err=79.5791 (max=37.3105) row #185: err=36.5226 (max=22.6301) row #186: err=54.368 (max=37.7491) row #187: err=31.9472 (max=16.7787) row #188: err=59.4599 (max=33.4338) row #189: err=67.0638 (max=49.7558) row #190: err=54.539 (max=42.0158) row #191: err=29.0013 (max=17.6628) row #192: err=55.0378 (max=27.5013) row #193: err=36.5066 (max=33.2416) row #194: err=22.4157 (max=13.6764) row #195: err=36.426 (max=29.0035) row #196: err=24.4191 (max=22.5652) row #197: err=27.3912 (max=25.9949) row #198: err=0.915223 (max=0.915223) row #199: err=3.60679e-13 (max=2.98261e-13) 494.5201252308407 49.755829752019224 494.5201252308407 49.755829752019224 I attached my test code in this message. Wong Hang <wongh...@gmail.com> 於 2020年2月7日 週五 下午10:49寫道: > Hi all, > > I found that the cholesky factorization unit test no longer works. > The value returned are completely wrong. It looks like a memory error. > I checked if I skip tril call, the value returned by cuSOLVER is correct. > There should be something wrong in libgpuarray > > ====================================================================== > ERROR: test_dense_chol_lower > (theano.gpuarray.tests.test_linalg.TestGpuCholesky64) > ---------------------------------------------------------------------- > Traceback (most recent call last): > File > "/home/wonghang/github/Theano/theano/gpuarray/tests/test_linalg.py", line > 327, in test_dense_chol_lower > self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace) > File > "/home/wonghang/github/Theano/theano/gpuarray/tests/test_linalg.py", line > 280, in compare_gpu_cholesky_to_np > utt.assert_allclose(chol_A_res, chol_A_val) > File "/home/wonghang/github/Theano/theano/tests/unittest_tools.py", line > 358, in assert_allclose > raise WrongValue(expected, value, rtol, atol) > theano.tests.unittest_tools.WrongValue: WrongValue > : shape, dtype, strides, min, max, n_inf, n_nan: > Expected : (3, 3) float64 (24, 8) 1.078578362e-314 1.0548793676823098 0 > 0 > Value : (3, 3) float64 (24, 8) 0.0 1.5121774155893968 0 0 > expected : [[2.00683310e-314 3.46328020e-001 1.07857836e-314] > [2.29026158e-001 1.05487937e+000 4.86725043e-001] > [2.07913268e-001 4.16263205e-001 1.04157477e+000]] > value : [[1.51217742 0. 0. ] > [0.22902616 1.05487937 0. ] > [0.20791327 0.41626321 1.04157477]] > Max Abs Diff: 1.5121774155893968 > Mean Abs Diff: 0.2605811643516005 > Median Abs Diff: 1.078578362e-314 > Std Abs Diff: 0.4752077922970366 > Max Rel Diff: inf > Mean Rel Diff: inf > Median Rel Diff: 1.3335589252099037e-16 > Std Rel Diff: nan > > rtol, atol: 1e-05 1e-08 > > > ====================================================================== > ERROR: test_diag_chol (theano.gpuarray.tests.test_linalg.TestGpuCholesky64) > ---------------------------------------------------------------------- > Traceback (most recent call last): > File > "/home/wonghang/github/Theano/theano/gpuarray/tests/test_linalg.py", line > 317, in test_diag_chol > self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace) > File > "/home/wonghang/github/Theano/theano/gpuarray/tests/test_linalg.py", line > 280, in compare_gpu_cholesky_to_np > utt.assert_allclose(chol_A_res, chol_A_val) > File "/home/wonghang/github/Theano/theano/tests/unittest_tools.py", line > 358, in assert_allclose > raise WrongValue(expected, value, rtol, atol) > theano.tests.unittest_tools.WrongValue: WrongValue > : shape, dtype, strides, min, max, n_inf, n_nan: > Expected : (5, 5) float64 (40, 8) 0.0 1.3969459393428005 0 0 > Value : (5, 5) float64 (40, 8) 0.0 1.3969459393428005 0 0 > expected : [[1.26525335e-314 0.00000000e+000 0.00000000e+000 > 0.00000000e+000 > 0.00000000e+000] > [0.00000000e+000 2.01543086e-314 0.00000000e+000 0.00000000e+000 > 0.00000000e+000] > [0.00000000e+000 0.00000000e+000 1.29480282e+000 0.00000000e+000 > 0.00000000e+000] > [0.00000000e+000 0.00000000e+000 0.00000000e+000 1.31448015e+000 > 0.00000000e+000] > [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000 > 1.39694594e+000]] > value : [[1.3040081 0. 0. 0. 0. ] > [0. 1.35800834 0. 0. 0. ] > [0. 0. 1.29480282 0. 0. ] > [0. 0. 0. 1.31448015 0. ] > [0. 0. 0. 0. 1.39694594]] > Max Abs Diff: 1.3580083368118308 > Mean Abs Diff: 0.106480657426342 > Median Abs Diff: 0.0 > Std Abs Diff: 0.361174224138967 > Max Rel Diff: nan > Mean Rel Diff: nan > Median Rel Diff: nan > Std Rel Diff: nan > > rtol, atol: 1e-05 1e-08 > > > ---------------------------------------------------------------------- > Ran 40 tests in 12.218s > > FAILED (errors=2, skipped=16) > > Please use the revision 07cd4ad56054c279442ee28413b26939f4c03632 of > libgpuarray > > Use the following command to install an old version of libgpuarray: > > $ git clone https://github.com/Theano/libgpuarray.git > $ cd libgpuarray > $ git checkout 07cd4ad56054c279442ee28413b26939f4c03632 . > $ mkdir cmake > $ cd cmake > $ cmake .. > $ make > $ sudo make install > $ sudo ldconfig > $ cd .. > $ python3 setup.py install > > and then run your theano code again. I think it would work now. > I will check the code in libgpuarray later. Let me raise an issue first. > > Best, > wonghang > > Paul Baggenstoss <p.m.baggenst...@ieee.org> 於 2020年2月7日 週五 下午9:49寫道: > >> Hi wonghang, Sorry to pester you with emails, but I have some >> interesting timing information. >> I ran a process using different processors and ways of computing >> Cholesky() >> The results are surprising. >> >> GpuMagmaCholesky() 9.0 sec >> slinalg.Cholesky(uses cusolver) 2.9 sec >> CPU 1.9 sec >> >> It looks like it pays to just use the CPU! >> >> Doesn't make any sense! >> Paul >> >> >> On Thursday, February 6, 2020 at 2:53:55 PM UTC+1, Paul Baggenstoss wrote: >>> >>> >>> Hello again. >>> So I added 64-bit support to >>> theano/gpuarray/c_code/magma_cholesky.c and to theano/gpuarray/linalg.py in >>> the function GpuMagmaCholesky(). I attached the files. >>> It works now for 32 and 64 bit and has gradient. The numerical problem >>> is gone. >>> But (and this is a big BUT) it iseems to be a factor of at least 2 >>> times slower than the CPU. Any thoughts on this? >>> Paul >>> >>> >>> On Thursday, February 6, 2020 at 10:28:08 AM UTC+1, Paul Baggenstoss >>> wrote: >>>> >>>> Simon, >>>> I did more digging and have some more information. I tested >>>> theano.gpuarray.linalg.GpuMagmaCholesky(), on float32 and it looks good. >>>> The result is exactly the same as for CPU. >>>> So the problem seems to be in CUsolver. The problem is that >>>> theano.gpuarray.linalg.GpuMagmaCholesky()(Cll) does not define a gradient >>>> and works only for >>>> float32. I installed the latest magma-2.5.2 and it has support for >>>> double precision Cholesky (dpotrf) but Theano seems to use it's own copy of >>>> the MAGMA source. >>>> Not sure how that works. Can I force Theano to use magma-2.5.2 ? If >>>> not, it seems feasible to borrow the gradient from >>>> theano.gpuarray.linalg.GpuCholesky() >>>> and add support for float64 as well. Thoughts? >>>> Paul >>>> >>>> >>>> On Wednesday, February 5, 2020 at 5:32:43 PM UTC+1, Paul Baggenstoss >>>> wrote: >>>>> >>>>> Hi Simon, I forgot to mention that I use the gradient of Cholesky, and >>>>> this has even more error than the Cholesky decomo, but I assume that this >>>>> is because >>>>> of a bug in Cholesky itself. >>>>> Paul >>>>> >>>>> >>>>> On Wednesday, February 5, 2020 at 5:30:10 PM UTC+1, Paul Baggenstoss >>>>> wrote: >>>>>> >>>>>> Hi Simon,I have uploaded the MATLAB format file with the matrix Cll, >>>>>> which is the original matrix, and R_cpu which was produced using CPU by >>>>>> slinalg.Cholesky( ), and R_cuda which >>>>>> was produced by the same function, but with GPU ( I think it uses >>>>>> theano.gpuarray.linalg.GpuCholesky() ) Both used the same precision >>>>>> (float32) so should give the same results. >>>>>> But you can see that at the end of the diagonal, the values go wild. >>>>>> It appears to be numericla errors. >>>>>> Thanks in advance! >>>>>> Paul >>>>>> >>>>>> >>>>>> >>>>>> >>>>>> On Wednesday, February 5, 2020 at 4:56:14 PM UTC+1, Wong Hang wrote: >>>>>>> >>>>>>> >>>>>>> Hi, >>>>>>> >>>>>>> The GPU cholesky decomposition relies on cuSOLVER or Magma. I >>>>>>> believe nvidia knows their hardware well and cuSOLVER should provide the >>>>>>> best efficient result. >>>>>>> >>>>>>> Although cholesky decomposition is very numerical stable, when I >>>>>>> write the test case, I find that I will get trouble for relatively small >>>>>>> matrix if I use single-precision. >>>>>>> >>>>>>> Are you using single-precision on a big matrix? >>>>>>> If not, try to compute the condition number of the matrix to see if >>>>>>> it is too big. >>>>>>> >>>>>>> If it is not too big, then it may be a bug. I also need to use the >>>>>>> cholesky operator, Please send me the matrix and I am willing to fix it. >>>>>>> >>>>>>> Best, >>>>>>> >>>>>>> 2020年2月6日(木) 0:34 Paul Baggenstoss <p.m.ba...@ieee.org>: >>>>>>> >>>>>>>> HI Simon, I was wondering if you got anywhere with the faster >>>>>>>> Cholesky for Theano. I also use it a lot and have found it to be >>>>>>>> unstable >>>>>>>> on the GPU. >>>>>>>> Paul >>>>>>>> >>>>>>>> On Saturday, March 7, 2015 at 11:45:36 AM UTC+1, Simon Ebner wrote: >>>>>>>>> >>>>>>>>> Hi all, >>>>>>>>> >>>>>>>>> I want to do computations where I rely heavily on the Cholesky >>>>>>>>> decomposition. Writing a small benchmark for tensor.slinalg.Cholesky, >>>>>>>>> I >>>>>>>>> noticed that the implementation is not as fast as I hoped. As far as >>>>>>>>> I can >>>>>>>>> tell it is not optimized for GPUs yet but relies on the scipy >>>>>>>>> implementation? >>>>>>>>> Doing a bit of a google seach I found several cuda implementations >>>>>>>>> for fast Cholesky decompositions on the GPU. Before I try to include >>>>>>>>> that >>>>>>>>> code into my theano environment, could you let me know whether you >>>>>>>>> decided >>>>>>>>> not to implement fast Cholesky decomposition on the GPU on purpose? >>>>>>>>> Furthermore, since I'm fairly new to theano I'm not completely >>>>>>>>> confident >>>>>>>>> how to incorporate cuda code best into my existing theano code. Is the >>>>>>>>> sensible to create a custom OP with optimized C-Code? >>>>>>>>> >>>>>>>>> Best, >>>>>>>>> Simon >>>>>>>>> >>>>>>>> -- >>>>>>>> >>>>>>>> --- >>>>>>>> You received this message because you are subscribed to the Google >>>>>>>> Groups "theano-users" group. >>>>>>>> To unsubscribe from this group and stop receiving emails from it, >>>>>>>> send an email to theano...@googlegroups.com. >>>>>>>> To view this discussion on the web visit >>>>>>>> https://groups.google.com/d/msgid/theano-users/aca41c35-ec36-4055-bac7-e53aced30ea7%40googlegroups.com >>>>>>>> <https://groups.google.com/d/msgid/theano-users/aca41c35-ec36-4055-bac7-e53aced30ea7%40googlegroups.com?utm_medium=email&utm_source=footer> >>>>>>>> . >>>>>>>> >>>>>>> -- >> >> --- >> You received this message because you are subscribed to the Google Groups >> "theano-users" group. >> To unsubscribe from this group and stop receiving emails from it, send an >> email to theano-users+unsubscr...@googlegroups.com. >> To view this discussion on the web visit >> https://groups.google.com/d/msgid/theano-users/7aac6c1b-4b3b-4ad3-9a1d-1f331e28cf02%40googlegroups.com >> <https://groups.google.com/d/msgid/theano-users/7aac6c1b-4b3b-4ad3-9a1d-1f331e28cf02%40googlegroups.com?utm_medium=email&utm_source=footer> >> . >> > -- --- You received this message because you are subscribed to the Google Groups "theano-users" group. To unsubscribe from this group and stop receiving emails from it, send an email to theano-users+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/theano-users/CAAMb3nWNGCuwaQuC_GE%3DYsGTHmaAARnqgfVge3i4tmTKXxMaOg%40mail.gmail.com.
import numpy as np from numpy import linalg from scipy.io import loadmat import theano import theano.tensor as T import theano.tensor.slinalg as Tla #data = loadmat("test.mat") #cpu_X32 = data['Cll'] #cpu_X64 = cpu_X32.astype(np.float64) np.random.seed(11111) N = 200 L = np.random.normal(size=(N,N)) X = L@L.T cpu_X32 = X.astype(np.float64) cpu_X64 = X gpu_X32 = theano.shared(cpu_X32) gpu_X64 = theano.shared(cpu_X64) gpu_compute32 = theano.function(inputs=[],outputs=Tla.cholesky(gpu_X32)) gpu_compute64 = theano.function(inputs=[],outputs=Tla.cholesky(gpu_X64)) cpu_L = linalg.cholesky(cpu_X64) gpu_X32_L = gpu_compute32() gpu_X64_L = gpu_compute64() for k in range(N): err = gpu_X64_L[k] - cpu_L[k] print("row #%d: err=%g (max=%g)" % (k,linalg.norm(err),np.max(np.abs(err)))) err = gpu_X32_L - cpu_L print(linalg.norm(err),np.max(np.abs(err))) err = gpu_X64_L - cpu_L print(linalg.norm(err),np.max(np.abs(err)))