Coverage for gpaw/kohnsham_layouts.py: 89% of 275 statements

# Copyright (C) 2010 CAMd
# Copyright (C) 2010 Argonne National Laboratory
# Please see the accompanying LICENSE file for further information.
import numpy as np
from scipy.linalg import eigh

from gpaw import debug
from gpaw.matrix_descriptor import MatrixDescriptor
from gpaw.mpi import broadcast_exception
from gpaw.blacs import BlacsGrid, Redistributor
from gpaw.utilities import uncamelcase
from gpaw.utilities.blas import mmm, r2k
from gpaw.utilities.scalapack import (pblas_simple_gemm, pblas_tran,
                                      scalapack_tri2full)
from gpaw.utilities.tools import tri2full
from gpaw.utilities.timing import nulltimer

def get_KohnSham_layouts(sl, mode, gd, bd, block_comm, dtype,
                         elpasolver=None, **kwargs):
    """Create Kohn-Sham layouts object."""
    # Not needed for the AtomPAW special mode; as usual we just provide
    # whatever happens to make the code not crash.
    if not isinstance(mode, str):
        return None  # XXX
    name = 'OrbitalLayouts'
    args = (gd, bd, block_comm, dtype)
    if sl is not None:
        name = 'Blacs' + name
        assert len(sl) == 3
        args += tuple(sl)
        if elpasolver is not None:
            kwargs['elpasolver'] = elpasolver
    ksl = {'BlacsOrbitalLayouts': BlacsOrbitalLayouts,
           'OrbitalLayouts': OrbitalLayouts}[name](*args, **kwargs)
    return ksl
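
# A minimal usage sketch (hedged; ``gd``, ``bd``, ``block_comm`` and ``nao``
# are placeholders for objects that GPAW's wavefunction setup normally
# provides, not constructed here):
#
#     ksl = get_KohnSham_layouts(None, 'lcao', gd, bd, block_comm,
#                                float, nao=nao)       # -> OrbitalLayouts
#     ksl = get_KohnSham_layouts((2, 2, 64), 'lcao', gd, bd, block_comm,
#                                float, nao=nao)  # -> BlacsOrbitalLayouts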

class KohnShamLayouts:
    using_blacs = False  # This is only used by a regression test
    matrix_descriptor_class = None
    accepts_decomposed_overlap_matrix = False

    def __init__(self, gd, bd, block_comm, dtype, timer=nulltimer):
        assert gd.comm.parent is bd.comm.parent  # must have same parent comm
        self.world = bd.comm.parent
        self.gd = gd
        self.bd = bd
        self.dtype = dtype
        self.block_comm = block_comm
        self.timer = timer
        self._kwargs = {'timer': timer}

        if gd.comm.rank == 0:
            self.column_comm = bd.comm
        else:
            self.column_comm = None

    def get_keywords(self):
        return self._kwargs.copy()  # just a shallow copy...

    def diagonalize(self, *args, **kwargs):
        raise RuntimeError('Virtual member function should not be called.')

    def inverse_cholesky(self, *args, **kwargs):
        raise RuntimeError('Virtual member function should not be called.')

    def new_descriptor(self):
        return self.matrix_descriptor_class(self.bd, self.gd, self)

    def __repr__(self):
        return uncamelcase(self.__class__.__name__)

    def get_description(self):
        """Description of this object in prose, e.g. for logging.

        Subclasses are expected to override this with something useful."""
        return repr(self)

class BlacsLayouts(KohnShamLayouts):
    using_blacs = True  # This is only used by a regression test

    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, timer=nulltimer):
        KohnShamLayouts.__init__(self, gd, bd, block_comm, dtype, timer)
        # WARNING: Do not create the BlacsGrid on a communicator which does
        # not contain block_comm.rank = 0.  This will break BlacsBandLayouts,
        # which assumes eps_M will be broadcast over block_comm.
        self.blocksize = blocksize
        self.blockgrid = BlacsGrid(self.block_comm, mcpus, ncpus)

    def get_description(self):
        title = 'BLACS'
        template = '%d x %d grid with %d x %d blocksize'
        return (title, template)

class BlacsOrbitalLayouts(BlacsLayouts):
    """ScaLAPACK Dense Linear Algebra.

    This class is instantiated in LCAO.  Not for casual use, at least
    for now.

    Requires two distributors and three descriptors for initialization
    as well as grid descriptors and band descriptors.  The distributors
    are for cols2blocks (1D -> 2D BLACS grid) and blocks2cols (2D -> 1D
    BLACS grid).  ScaLAPACK operations must occur on a 2D BLACS grid for
    performance and scalability.

    _general_diagonalize is "hard-coded" for LCAO.  It expects both the
    Hamiltonian and the overlap matrix to be on the 2D BLACS grid; this
    is done early on to save memory.
    """
    # XXX rewrite this docstring a bit!
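    # Redistribution paths used below (a hedged sketch of the descriptors
    # set up in __init__):
    #
    #   mM_unique_descriptor (nao x nao, column layout)
    #       --mM2mm (cols2blocks)-->  mmdescriptor (2D block-cyclic),
    #                                 where ScaLAPACK/Elpa operate
    #       --mm2nM (blocks2cols)-->  nM_unique_descriptor
    #                                 (nbands x nao, column layout)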

    # This class 'describes' all the LCAO Blacs-related layouts
    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, nao, elpasolver=None, timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                              mcpus, ncpus, blocksize, timer)
        nbands = bd.nbands
        self.blocksize = blocksize

        self.orbital_comm = self.bd.comm
        self.naoblocksize = naoblocksize = -((-nao) // self.orbital_comm.size)
        self.nao = nao

        # Range of basis functions for BLACS distribution of matrices:
        self.Mmax = nao
        self.Mstart = min(bd.comm.rank * naoblocksize, self.Mmax)
        self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
        self.mynao = self.Mstop - self.Mstart
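        # A worked example of the ceiling division above (hypothetical
        # numbers): nao = 10 and orbital_comm.size = 4 give
        # naoblocksize = -((-10) // 4) = 3, so the ranks own the basis
        # ranges [0, 3), [3, 6), [6, 9) and [9, 10); mynao is 3, 3, 3, 1.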

        # Column layout for one matrix per band rank:
        self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
        self.mMdescriptor = self.columngrid.new_descriptor(nao, nao,
                                                           naoblocksize, nao)
        self.nMdescriptor = self.columngrid.new_descriptor(nbands, nao,
                                                           bd.maxmynbands,
                                                           nao)

        # parallelprint(world, (mynao, self.mMdescriptor.shape))

        # Column layout for one matrix in total (only on grid masters):
        self.single_column_grid = BlacsGrid(self.column_comm, bd.comm.size, 1)
        self.mM_unique_descriptor = self.single_column_grid.new_descriptor(
            nao, nao, naoblocksize, nao)

        # nM_unique_descriptor is meant to hold the coefficients after
        # diagonalization.  BLACS requires it to be nao-by-nao, but
        # we only fill meaningful data into the first nbands columns.
        #
        # The array will then be trimmed and broadcast across
        # the grid descriptor's communicator.
        self.nM_unique_descriptor = self.single_column_grid.new_descriptor(
            nbands, nao, bd.maxmynbands, nao)

        # Fully blocked grid for diagonalization with many CPUs:
        self.mmdescriptor = self.blockgrid.new_descriptor(nao, nao, blocksize,
                                                          blocksize)

        # self.nMdescriptor = nMdescriptor
        self.mM2mm = Redistributor(self.block_comm, self.mM_unique_descriptor,
                                   self.mmdescriptor)
        self.mm2nM = Redistributor(self.block_comm, self.mmdescriptor,
                                   self.nM_unique_descriptor)

        self.libelpa = None
        if elpasolver is not None:
            # XXX forward solver to libelpa
            from gpaw.utilities.elpa import LibElpa
            self.libelpa = LibElpa(self.mmdescriptor, solver=elpasolver,
                                   nev=nbands)

    @property
    def accepts_decomposed_overlap_matrix(self):
        return self.libelpa is not None

    def diagonalize(self, H_mm, C_nM, eps_n, S_mm,
                    is_already_decomposed=False):
        # C_nM needs to be simultaneously compatible with:
        # 1. outdescriptor
        # 2. broadcast with gd.comm
        # We do this with a dummy buffer C2_nM.
        outdescriptor = self.mm2nM.dstdescriptor  # blocks2cols
        blockdescriptor = self.mM2mm.dstdescriptor  # cols2blocks

        dtype = S_mm.dtype
        eps_M = np.empty(C_nM.shape[-1])  # empty helps us debug
        subM, subN = outdescriptor.gshape
        C_mm = blockdescriptor.zeros(dtype=dtype)

        self.timer.start('General diagonalize')
        # general_diagonalize_ex may have a buffer overflow, so
        # we no longer use it:
        # blockdescriptor.general_diagonalize_ex(H_mm, S_mm.copy(), C_mm,
        #                                        eps_M,
        #                                        UL='L', iu=self.bd.nbands)
        if self.libelpa is not None:
            assert blockdescriptor is self.libelpa.desc
            scalapack_tri2full(blockdescriptor, H_mm)

            # Elpa will write the decomposed form of S_mm into S_mm.
            # Other KSL diagonalization functions do *not* overwrite S_mm.
            self.libelpa.general_diagonalize(
                H_mm, S_mm, C_mm, eps_M[:self.bd.nbands],
                is_already_decomposed=is_already_decomposed)
        else:
            blockdescriptor.general_diagonalize_dc(H_mm, S_mm.copy(), C_mm,
                                                   eps_M, UL='L')
        self.timer.stop('General diagonalize')

        # Make C_nM compatible with the redistributor:
        self.timer.start('Redistribute coefs')
        if outdescriptor:
            C2_nM = C_nM
        else:
            C2_nM = outdescriptor.empty(dtype=dtype)
        assert outdescriptor.check(C2_nM)
        self.mm2nM.redistribute(C_mm, C2_nM, subM, subN)  # blocks2cols
        self.timer.stop('Redistribute coefs')

        self.timer.start('Send coefs to domains')
        # eps_M is already on block_comm.rank = 0; it is easier to
        # broadcast eps_M to all and take the correct slice afterwards.
        self.block_comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        self.gd.comm.broadcast(C_nM, 0)
        self.timer.stop('Send coefs to domains')
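
    # Shape contract for diagonalize() above (a hedged summary inferred
    # from the descriptors): H_mm and S_mm are local blocks of nao-by-nao
    # matrices on mmdescriptor, C_nM is the (bd.mynbands, nao) coefficient
    # array replicated over gd.comm, and eps_n receives this rank's
    # bd.mynbands eigenvalues.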

    def distribute_overlap_matrix(self, S_qmM, root=0,
                                  add_hermitian_conjugate=False):
        # Some MPI implementations need a lot of memory to do large
        # reductions.  To avoid trouble, we do comm.sum on smaller blocks
        # of S (this code is also safe for arrays smaller than blocksize).
        Sflat_x = S_qmM.ravel()
        blocksize = 2**23 // Sflat_x.itemsize  # 8 MiB
        nblocks = -(-len(Sflat_x) // blocksize)
        Mstart = 0
        self.timer.start('blocked summation')
        for i in range(nblocks):
            self.gd.comm.sum(Sflat_x[Mstart:Mstart + blocksize], root=root)
            Mstart += blocksize
        assert Mstart + blocksize >= len(Sflat_x)
        self.timer.stop('blocked summation')
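        # A worked example of the blocking above (hypothetical sizes): for
        # a complex128 overlap matrix with nao = 2000, Sflat_x has
        # 4 000 000 elements, blocksize = 2**23 // 16 = 524288, and the
        # reduction proceeds in nblocks = 8 chunks of at most 8 MiB each.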

        xshape = S_qmM.shape[:-2]
        if len(xshape) == 0:
            S_qmM = S_qmM[np.newaxis]
        assert S_qmM.ndim == 3

        blockdesc = self.mmdescriptor
        coldesc = self.mM_unique_descriptor
        S_qmm = blockdesc.zeros(len(S_qmM), S_qmM.dtype)

        if not coldesc:  # XXX ugly way to sort out inactive ranks
            S_qmM = coldesc.zeros(len(S_qmM), S_qmM.dtype)

        self.timer.start('Scalapack redistribute')
        for S_mM, S_mm in zip(S_qmM, S_qmm):
            self.mM2mm.redistribute(S_mM, S_mm)
            if add_hermitian_conjugate:
                if blockdesc.active:
                    pblas_tran(1.0, S_mm.copy(), 1.0, S_mm,
                               blockdesc, blockdesc)

            if self.libelpa is not None:
                # Elpa needs the full H_mm, but apparently does not
                # need the full S_mm.  However, that fact is not
                # documented, so we stay on the safe side and tri2full
                # the matrix.
                scalapack_tri2full(blockdesc, S_mm)

        self.timer.stop('Scalapack redistribute')
        return S_qmm.reshape(xshape + blockdesc.shape)

    def get_overlap_matrix_shape(self):
        return self.mmdescriptor.shape
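
    # In matrix form, calculate_blocked_density_matrix() below computes
    #     rho = C^dagger diag(f) C
    # via pblas_simple_gemm with transa='C' on the 2D block-cyclic grid.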

    def calculate_blocked_density_matrix(self, f_n, C_nM):
        nbands = self.bd.nbands
        nao = self.nao
        dtype = C_nM.dtype

        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None])
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM

        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # No use deleting C_nM, as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)

        if 1:  # if self.libelpa is None:
            pblas_simple_gemm(self.mmdescriptor,
                              self.mmdescriptor,
                              self.mmdescriptor,
                              Cf_mm, C_mm, rho_mm, transa='C')
        else:
            # elpa_hermitian_multiply was not faster than the ordinary
            # multiplication in the test.  The way we have things
            # distributed, we need to transpose things at the moment.
            #
            # Rather than enabling this, we should store the coefficients
            # in an appropriate 2D block-cyclic format (c_nm) and not the
            # current C_nM format.  This makes it possible to avoid
            # redistributing the coefficients at all.  But we don't have
            # time to implement this at the moment.
            mul = self.libelpa.hermitian_multiply
            desc = self.mmdescriptor
            from gpaw.utilities.scalapack import pblas_tran

            def T(array):
                tmp = array.copy()
                pblas_tran(alpha=1.0, a_MN=tmp,
                           beta=0.0, c_NM=array,
                           desca=desc, descc=desc)
            T(C_mm)
            T(Cf_mm)
            mul(C_mm, Cf_mm, rho_mm,
                desc, desc, desc,
                uplo_a='X', uplo_c='X')

        return rho_mm

    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate density matrix from occupations and coefficients.

        Presently this function performs the usual ScaLAPACK three-step
        trick: redistribute, number-crunch, redistribute back.

        Notes on future performance improvement.

        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:

        Keep coefficients in C_mm form after the diagonalization.
        rho_mm can then be directly calculated from C_mm without
        redistribution, after which we only need to redistribute
        rho_mm across domains.
        """
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def distribute_to_columns(self, rho_mm, srcdescriptor):
        redistributor = Redistributor(self.block_comm,  # XXX
                                      srcdescriptor,
                                      self.mM_unique_descriptor)
        rho_mM = redistributor.redistribute(rho_mm)
        if self.gd.rank != 0:
            rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def oldcalculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        # This version is parallel over the band descriptor only.
        # This is inefficient, but let's keep it for a while in case
        # there's trouble with the more efficient version.
        if rho_mM is None:
            rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)

        Cf_nM = (C_nM * f_n[:, None]).conj()
        pblas_simple_gemm(self.nMdescriptor, self.nMdescriptor,
                          self.mMdescriptor, Cf_nM, C_nM, rho_mM, transa='T')
        return rho_mM

    def get_transposed_density_matrix(self, f_n, C_nM, rho_mM=None):
        return self.calculate_density_matrix(f_n, C_nM, rho_mM).conj()

    def get_description(self):
        (title, template) = BlacsLayouts.get_description(self)
        bg = self.blockgrid
        desc = self.mmdescriptor
        s = template % (bg.nprow, bg.npcol, desc.mb, desc.nb)
        if self.libelpa is not None:
            solver = self.libelpa.description
        else:
            solver = 'ScaLAPACK'
        return ''.join([title, ' / ', solver, ', ', s])
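
    # For example (hypothetical 2 x 2 process grid with 64 x 64 blocks and
    # no Elpa), get_description() returns:
    #     'BLACS / ScaLAPACK, 2 x 2 grid with 64 x 64 blocksize'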

class OrbitalLayouts(KohnShamLayouts):
    def __init__(self, gd, bd, block_comm, dtype, nao,
                 timer=nulltimer):
        KohnShamLayouts.__init__(self, gd, bd, block_comm, dtype, timer)
        self.mMdescriptor = MatrixDescriptor(nao, nao)
        self.nMdescriptor = MatrixDescriptor(bd.mynbands, nao)

        self.Mstart = 0
        self.Mstop = nao
        self.Mmax = nao
        self.mynao = nao
        self.nao = nao
        self.orbital_comm = bd.comm

    def diagonalize(self, H_MM, C_nM, eps_n, S_MM,
                    overwrite_S=False):
        assert not overwrite_S
        eps_M = np.empty(C_nM.shape[-1])
        self.block_comm.broadcast(H_MM, 0)
        self.block_comm.broadcast(S_MM, 0)
        # The result on different processors is not necessarily bit-wise
        # identical, so only the domain master performs the computation.
        with broadcast_exception(self.gd.comm):
            if self.gd.comm.rank == 0:
                self._diagonalize(H_MM, S_MM.copy(), eps_M)
        self.gd.comm.broadcast(H_MM, 0)
        self.gd.comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        C_nM[:] = H_MM[self.bd.get_slice()]

    def _diagonalize(self, H_MM, S_MM, eps_M):
        """Serial diagonalize via LAPACK."""
        # This is replicated computation, but it ultimately avoids
        # additional communication.
        if eps_M.size == 0:
            return
        eps_M[:], H_MM.T[:] = eigh(H_MM, S_MM,
                                   overwrite_a=True,
                                   check_finite=debug)
        if H_MM.dtype == complex:
            np.negative(H_MM.imag, H_MM.imag)
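        # Note on the two tricks above: assigning to H_MM.T[:] stores the
        # eigenvectors (returned as columns by eigh) as rows of H_MM, and
        # negating the imaginary part conjugates them in place, so that
        # diagonalize() can slice rows directly into C_nM.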

    def estimate_memory(self, mem, dtype):
        itemsize = mem.itemsize[dtype]
        mem.subnode('eps [M]', self.nao * mem.floatsize)
        mem.subnode('H [MM]', self.nao * self.nao * itemsize)

    def distribute_overlap_matrix(self, S_qMM, root=0,
                                  add_hermitian_conjugate=False):
        self.gd.comm.sum(S_qMM, root)
        if add_hermitian_conjugate:
            S_qMM += S_qMM.swapaxes(-1, -2).conj()
        return S_qMM

    def get_overlap_matrix_shape(self):
        return self.nao, self.nao

    def calculate_density_matrix(self, f_n, C_nM, rho_MM=None, C2_nM=None):
        # Only a madman would use a non-transposed density matrix.
        # Maybe we should use get_transposed_density_matrix instead.
        if rho_MM is None:
            rho_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)
        # XXX Should not conjugate, but call gemm(..., 'c').
        # Although that requires knowing C_Mn and not C_nM, it also
        # conforms better to the usual conventions in the literature.

        # Use only occupied bands to construct the density matrix:
        occupied = abs(f_n) > 1.0e-10
        if C2_nM is None:
            C2_nM = C_nM
        Cf_Mn = \
            np.ascontiguousarray(C2_nM[occupied].T.conj() * f_n[occupied])
        # gemm(1.0, C_nM[occupied], Cf_Mn, 0.0, rho_MM, 'n')
        mmm(1.0, Cf_Mn, 'N', C_nM[occupied], 'N', 0.0, rho_MM)
        return rho_MM
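
    # A tiny worked example (hypothetical numbers): with nao = 2, a single
    # occupied band with f_n = [2.0] and C_nM = [[1.0, 0.0]], Cf_Mn is
    # [[2.0], [0.0]] and rho_MM = Cf_Mn @ C_nM = [[2.0, 0.0], [0.0, 0.0]].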

    def get_transposed_density_matrix(self, f_n, C_nM, rho_MM=None):
        return self.calculate_density_matrix(f_n, C_nM, rho_MM).T.copy()

        # if rho_MM is None:
        #     rho_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)
        # C_Mn = C_nM.T.copy()
        # gemm(1.0, C_Mn, f_n[np.newaxis, :] * C_Mn, 0.0, rho_MM, 'c')
        # self.bd.comm.sum(rho_MM)
        # return rho_MM

    def alternative_calculate_density_matrix(self, f_n, C_nM, rho_MM=None):
        if rho_MM is None:
            rho_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)
        # Alternative suggestion.  Might be faster.  Someone should test this.
        C_Mn = C_nM.T.copy()
        r2k(0.5, C_Mn, f_n * C_Mn, 0.0, rho_MM)
        tri2full(rho_MM)
        return rho_MM

    def get_description(self):
        return 'Serial LAPACK'

    def calculate_density_matrix_delta(self, d_nn, C_nM, rho_MM=None):
        # Only a madman would use a non-transposed density matrix.
        # Maybe we should use get_transposed_density_matrix instead.
        if rho_MM is None:
            rho_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)
        Cd_Mn = np.zeros((self.nao, self.bd.mynbands), dtype=C_nM.dtype)
        # XXX Should not conjugate, but call gemm(..., 'c').
        # Although that requires knowing C_Mn and not C_nM, it also
        # conforms better to the usual conventions in the literature.
        C_Mn = C_nM.T.conj().copy()
        # gemm(1.0, d_nn, C_Mn, 0.0, Cd_Mn, 'n')
        # gemm(1.0, C_nM, Cd_Mn, 0.0, rho_MM, 'n')
        mmm(1.0, C_Mn, 'N', d_nn, 'N', 0.0, Cd_Mn)
        mmm(1.0, Cd_Mn, 'N', C_nM, 'N', 0.0, rho_MM)
        self.bd.comm.sum(rho_MM)
        return rho_MM

    def get_transposed_density_matrix_delta(self, d_nn, C_nM, rho_MM=None):
        return self.calculate_density_matrix_delta(d_nn, C_nM,
                                                   rho_MM).T.copy()