I would use option 3. Keep a work vector and do a vector
summation instead of the multiple multiplication by scale
and 1/scale.
I agree with you the docs are a little misleading here.
On Thu, Aug 10, 2023, 11:40 Niclas Götting
<ngoett...@itp.uni-bremen.de> wrote:
Thank you both for the very quick answer!
So far, I compiled PETSc with debugging turned on, but I
think it should still be faster than standard scipy in
both cases. Actually, Stefano's answer has got me very
far already; now I only define the RHS of the ODE and no
Jacobian (I wonder, why the documentation suggests
otherwise, though). I had the following four tries at
implementing the RHS:
1. def rhsfunc1(ts, t, u, F):
scale = 0.5 * (5 < t < 10)
(l + scale * pump).mult(u, F)
2. def rhsfunc2(ts, t, u, F):
l.mult(u, F)
scale = 0.5 * (5 < t < 10)
(scale * pump).multAdd(u, F, F)
3. def rhsfunc3(ts, t, u, F):
l.mult(u, F)
scale = 0.5 * (5 < t < 10)
if scale != 0:
pump.scale(scale)
pump.multAdd(u, F, F)
pump.scale(1/scale)
4. def rhsfunc4(ts, t, u, F):
tmp_pump.zeroEntries() # tmp_pump is
pump.duplicate()
l.mult(u, F)
scale = 0.5 * (5 < t < 10)
tmp_pump.axpy(scale, pump,
structure=PETSc.Mat.Structure.SAME_NONZERO_PATTERN)
tmp_pump.multAdd(u, F, F)
They all yield the same results, but with 50it/s,
800it/, 2300it/s and 1900it/s, respectively, which is a
huge performance boost (almost 7 times as fast as scipy,
with PETSc debugging still turned on). As the scale
function will most likely be a gaussian in the future, I
think that option 3 will be become numerically unstable
and I'll have to go with option 4, which is already
faster than I expected. If you think it is possible to
speed up the RHS calculation even more, I'd be happy to
hear your suggestions; the -log_view is attached to this
message.
One last point: If I didn't misunderstand the
documentation at
https://petsc.org/release/manual/ts/#special-cases,
should this maybe be changed?
Best regards
Niclas
On 09.08.23 17:51, Stefano Zampini wrote:
TSRK is an explicit solver. Unless you are changing the
ts type from command line, the explicit jacobian
should not be needed. On top of Barry's suggestion, I
would suggest you to write the explicit RHS instead of
assembly a throw away matrix every time that function
needs to be sampled.
On Wed, Aug 9, 2023, 17:09 Niclas Götting
<ngoett...@itp.uni-bremen.de> wrote:
Hi all,
I'm currently trying to convert a quantum
simulation from scipy to
PETSc. The problem itself is extremely simple and
of the form \dot{u}(t)
= (A_const + f(t)*B_const)*u(t), where f(t) in this
simple test case is
a square function. The matrices A_const and B_const
are extremely sparse
and therefore I thought, the problem will be well
suited for PETSc.
Currently, I solve the ODE with the following
procedure in scipy (I can
provide the necessary data files, if needed, but
they are just some
trace-preserving, very sparse matrices):
import numpy as np
import scipy.sparse
import scipy.integrate
from tqdm import tqdm
l = np.load("../liouvillian.npy")
pump = np.load("../pump_operator.npy")
state = np.load("../initial_state.npy")
l = scipy.sparse.csr_array(l)
pump = scipy.sparse.csr_array(pump)
def f(t, y, *args):
return (l + 0.5 * (5 < t < 10) * pump) @ y
#return l @ y # Uncomment for f(t) = 0
dt = 0.1
NUM_STEPS = 200
res = np.empty((NUM_STEPS, 4096), dtype=np.complex128)
solver =
scipy.integrate.ode(f).set_integrator("zvode").set_initial_value(state)
times = []
for i in tqdm(range(NUM_STEPS)):
res[i, :] = solver.integrate(solver.t + dt)
times.append(solver.t)
Here, A_const = l, B_const = pump and f(t) = 5 < t
< 10. tqdm reports
about 330it/s on my machine. When converting the
code to PETSc, I came
to the following result (according to the chapter
https://petsc.org/main/manual/ts/#special-cases)
import sys
import petsc4py
petsc4py.init(args=sys.argv)
import numpy as np
import scipy.sparse
from tqdm import tqdm
from petsc4py import PETSc
comm = PETSc.COMM_WORLD
def mat_to_real(arr):
return np.block([[arr.real, -arr.imag],
[arr.imag,
arr.real]]).astype(np.float64)
def mat_to_petsc_aij(arr):
arr_sc_sp = scipy.sparse.csr_array(arr)
mat = PETSc.Mat().createAIJ(arr.shape[0],
comm=comm)
rstart, rend = mat.getOwnershipRange()
print(rstart, rend)
print(arr.shape[0])
print(mat.sizes)
I = arr_sc_sp.indptr[rstart : rend + 1] -
arr_sc_sp.indptr[rstart]
J = arr_sc_sp.indices[arr_sc_sp.indptr[rstart] :
arr_sc_sp.indptr[rend]]
V = arr_sc_sp.data[arr_sc_sp.indptr[rstart] :
arr_sc_sp.indptr[rend]]
print(I.shape, J.shape, V.shape)
mat.setValuesCSR(I, J, V)
mat.assemble()
return mat
l = np.load("../liouvillian.npy")
l = mat_to_real(l)
pump = np.load("../pump_operator.npy")
pump = mat_to_real(pump)
state = np.load("../initial_state.npy")
state = np.hstack([state.real,
state.imag]).astype(np.float64)
l = mat_to_petsc_aij(l)
pump = mat_to_petsc_aij(pump)
jac = l.duplicate()
for i in range(8192):
jac.setValue(i, i, 0)
jac.assemble()
jac += l
vec = l.createVecRight()
vec.setValues(np.arange(state.shape[0],
dtype=np.int32), state)
vec.assemble()
dt = 0.1
ts = PETSc.TS().create(comm=comm)
ts.setFromOptions()
ts.setProblemType(ts.ProblemType.LINEAR)
ts.setEquationType(ts.EquationType.ODE_EXPLICIT)
ts.setType(ts.Type.RK)
ts.setRKType(ts.RKType.RK3BS)
ts.setTime(0)
print("KSP:", ts.getKSP().getType())
print("KSP PC:",ts.getKSP().getPC().getType())
print("SNES :", ts.getSNES().getType())
def jacobian(ts, t, u, Amat, Pmat):
Amat.zeroEntries()
Amat.aypx(1, l,
structure=PETSc.Mat.Structure.SUBSET_NONZERO_PATTERN)
Amat.axpy(0.5 * (5 < t < 10), pump,
structure=PETSc.Mat.Structure.SUBSET_NONZERO_PATTERN)
ts.setRHSFunction(PETSc.TS.computeRHSFunctionLinear)
#ts.setRHSJacobian(PETSc.TS.computeRHSJacobianConstant,
l, l) #
Uncomment for f(t) = 0
ts.setRHSJacobian(jacobian, jac)
NUM_STEPS = 200
res = np.empty((NUM_STEPS, 8192), dtype=np.float64)
times = []
rstart, rend = vec.getOwnershipRange()
for i in tqdm(range(NUM_STEPS)):
time = ts.getTime()
ts.setMaxTime(time + dt)
ts.solve(vec)
res[i, rstart:rend] = vec.getArray()[:]
times.append(time)
I decomposed the complex ODE into a larger real
ODE, so that I can
easily switch maybe to GPU computation later on.
Now, the solutions of
both scripts are very much identical, but PETSc
runs about 3 times
slower at 120it/s on my machine. I don't use MPI
for PETSc yet.
I strongly suppose that the problem lies within the
jacobian definition,
as PETSc is about 3 times *faster* than scipy with
f(t) = 0 and
therefore a constant jacobian.
Thank you in advance.
All the best,
Niclas