Coverage for gpaw/kohnsham_layouts.py: 89% (275 statements)

# Copyright (C) 2010  CAMd
# Copyright (C) 2010  Argonne National Laboratory
# Please see the accompanying LICENSE file for further information.
import numpy as np
from scipy.linalg import eigh

from gpaw import debug
from gpaw.matrix_descriptor import MatrixDescriptor
from gpaw.mpi import broadcast_exception
from gpaw.blacs import BlacsGrid, Redistributor
from gpaw.utilities import uncamelcase
from gpaw.utilities.blas import mmm, r2k
from gpaw.utilities.scalapack import (pblas_simple_gemm, pblas_tran,
                                      scalapack_tri2full)
from gpaw.utilities.tools import tri2full
from gpaw.utilities.timing import nulltimer


def get_KohnSham_layouts(sl, mode, gd, bd, block_comm, dtype,
                         elpasolver=None, **kwargs):
    """Create Kohn-Sham layouts object."""
    # Not needed for the AtomPAW special mode; as usual we just provide
    # whatever happens to make the code not crash.
    if not isinstance(mode, str):
        return None  # XXX
    name = 'OrbitalLayouts'
    args = (gd, bd, block_comm, dtype)
    if sl is not None:
        name = 'Blacs' + name
        assert len(sl) == 3
        args += tuple(sl)
        if elpasolver is not None:
            kwargs['elpasolver'] = elpasolver
    ksl = {'BlacsOrbitalLayouts': BlacsOrbitalLayouts,
           'OrbitalLayouts': OrbitalLayouts}[name](*args,
                                                   **kwargs)
    return ksl
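
# Illustrative sketch (added, not part of the original module): how the
# factory above picks a layout class.  The concrete values for ``sl`` and
# ``mode`` below are hypothetical; ``nao`` is forwarded to the layout
# constructor through ``**kwargs``.
#
#     ksl = get_KohnSham_layouts(None, 'lcao', gd, bd, block_comm,
#                                float, nao=nao)    # -> OrbitalLayouts
#     ksl = get_KohnSham_layouts((2, 2, 64), 'lcao', gd, bd, block_comm,
#                                float, nao=nao)    # -> BlacsOrbitalLayouts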


class KohnShamLayouts:
    using_blacs = False  # This is only used by a regression test
    matrix_descriptor_class = None
    accepts_decomposed_overlap_matrix = False

    def __init__(self, gd, bd, block_comm, dtype, timer=nulltimer):
        assert gd.comm.parent is bd.comm.parent  # must have same parent comm
        self.world = bd.comm.parent
        self.gd = gd
        self.bd = bd
        self.dtype = dtype
        self.block_comm = block_comm
        self.timer = timer
        self._kwargs = {'timer': timer}

        if gd.comm.rank == 0:
            self.column_comm = bd.comm
        else:
            self.column_comm = None

    def get_keywords(self):
        return self._kwargs.copy()  # just a shallow copy...

    def diagonalize(self, *args, **kwargs):
        raise RuntimeError('Virtual member function should not be called.')

    def inverse_cholesky(self, *args, **kwargs):
        raise RuntimeError('Virtual member function should not be called.')

    def new_descriptor(self):
        return self.matrix_descriptor_class(self.bd, self.gd, self)

    def __repr__(self):
        return uncamelcase(self.__class__.__name__)

    def get_description(self):
        """Description of this object in prose, e.g. for logging.

        Subclasses are expected to override this with something useful."""
        return repr(self)


class BlacsLayouts(KohnShamLayouts):
    using_blacs = True  # This is only used by a regression test

    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, timer=nulltimer):
        KohnShamLayouts.__init__(self, gd, bd, block_comm, dtype,
                                 timer)
        # WARNING: Do not create the BlacsGrid on a communicator which does
        # not contain block_comm.rank = 0.  This will break BlacsBandLayouts,
        # which assumes eps_M will be broadcast over block_comm.
        self.blocksize = blocksize
        self.blockgrid = BlacsGrid(self.block_comm, mcpus, ncpus)

    def get_description(self):
        title = 'BLACS'
        template = '%d x %d grid with %d x %d blocksize'
        return (title, template)


class BlacsOrbitalLayouts(BlacsLayouts):
    """ScaLAPACK Dense Linear Algebra.

    This class is instantiated in LCAO.  Not for casual use, at least for now.

    Requires two distributors and three descriptors for initialization
    as well as grid descriptors and band descriptors.  Distributors are
    for cols2blocks (1D -> 2D BLACS grid) and blocks2cols (2D -> 1D
    BLACS grid).  ScaLAPACK operations must occur on the 2D BLACS grid
    for performance and scalability.

    _general_diagonalize is "hard-coded" for LCAO.
    Expects both the Hamiltonian and the overlap matrix to be on the 2D
    BLACS grid.  This is done early on to save memory.
    """
    # XXX rewrite this docstring a bit!

    # This class 'describes' all the LCAO BLACS-related layouts
    def __init__(self, gd, bd, block_comm, dtype, mcpus, ncpus,
                 blocksize, nao, elpasolver=None, timer=nulltimer):
        BlacsLayouts.__init__(self, gd, bd, block_comm, dtype,
                              mcpus, ncpus, blocksize, timer)
        nbands = bd.nbands
        self.blocksize = blocksize

        self.orbital_comm = self.bd.comm
        self.naoblocksize = naoblocksize = -((-nao) // self.orbital_comm.size)
        self.nao = nao

        # Range of basis functions for BLACS distribution of matrices:
        self.Mmax = nao
        self.Mstart = min(bd.comm.rank * naoblocksize, self.Mmax)
        self.Mstop = min(self.Mstart + naoblocksize, self.Mmax)
        self.mynao = self.Mstop - self.Mstart
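
        # Illustrative note (added, not from the original source):
        # -((-nao) // size) is integer ceiling division, so each band rank
        # owns at most ceil(nao / size) basis functions.  For a hypothetical
        # nao = 10 split over 4 ranks, naoblocksize = 3 and the
        # [Mstart, Mstop) ranges become [0, 3), [3, 6), [6, 9) and [9, 10).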

        # Column layout for one matrix per band rank:
        self.columngrid = BlacsGrid(bd.comm, bd.comm.size, 1)
        self.mMdescriptor = self.columngrid.new_descriptor(nao, nao,
                                                           naoblocksize, nao)
        self.nMdescriptor = self.columngrid.new_descriptor(nbands, nao,
                                                           bd.maxmynbands, nao)

        # parallelprint(world, (mynao, self.mMdescriptor.shape))

        # Column layout for one matrix in total (only on grid masters):
        self.single_column_grid = BlacsGrid(self.column_comm, bd.comm.size, 1)
        self.mM_unique_descriptor = self.single_column_grid.new_descriptor(
            nao, nao, naoblocksize, nao)

        # nM_unique_descriptor is meant to hold the coefficients after
        # diagonalization.  BLACS requires it to be nao-by-nao, but
        # we only fill meaningful data into the first nbands columns.
        #
        # The array will then be trimmed and broadcast across
        # the grid descriptor's communicator.
        self.nM_unique_descriptor = self.single_column_grid.new_descriptor(
            nbands, nao, bd.maxmynbands, nao)

        # Fully blocked grid for diagonalization with many CPUs:
        self.mmdescriptor = self.blockgrid.new_descriptor(nao, nao, blocksize,
                                                          blocksize)

        # self.nMdescriptor = nMdescriptor
        self.mM2mm = Redistributor(self.block_comm, self.mM_unique_descriptor,
                                   self.mmdescriptor)
        self.mm2nM = Redistributor(self.block_comm, self.mmdescriptor,
                                   self.nM_unique_descriptor)
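
        # Illustrative note (added, not from the original source): the two
        # redistributors above define the data path used by diagonalize():
        #
        #     mM_unique (1D column layout, grid masters)
        #         --mM2mm-->  mm (2D block-cyclic layout, ScaLAPACK/Elpa)
        #         --mm2nM-->  nM_unique (1D column layout of coefficients)
        #
        # i.e. matrices enter in a column distribution, are diagonalized on
        # the 2D BLACS grid, and the eigenvector block is sent back to
        # columns.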

        self.libelpa = None
        if elpasolver is not None:
            # XXX forward solver to libelpa
            from gpaw.utilities.elpa import LibElpa
            self.libelpa = LibElpa(self.mmdescriptor, solver=elpasolver,
                                   nev=nbands)

    @property
    def accepts_decomposed_overlap_matrix(self):
        return self.libelpa is not None

    def diagonalize(self, H_mm, C_nM, eps_n, S_mm,
                    is_already_decomposed=False):
        # C_nM needs to be simultaneously compatible with:
        # 1. outdescriptor
        # 2. broadcast with gd.comm
        # We will do this with a dummy buffer C2_nM.
        outdescriptor = self.mm2nM.dstdescriptor  # blocks2cols
        blockdescriptor = self.mM2mm.dstdescriptor  # cols2blocks

        dtype = S_mm.dtype
        eps_M = np.empty(C_nM.shape[-1])  # empty helps us debug
        subM, subN = outdescriptor.gshape
        C_mm = blockdescriptor.zeros(dtype=dtype)

        self.timer.start('General diagonalize')
        # general_diagonalize_ex may have a buffer overflow, so
        # we no longer use it
        # blockdescriptor.general_diagonalize_ex(H_mm, S_mm.copy(), C_mm,
        #                                        eps_M,
        #                                        UL='L', iu=self.bd.nbands)
        if self.libelpa is not None:
            assert blockdescriptor is self.libelpa.desc
            scalapack_tri2full(blockdescriptor, H_mm)

            # Elpa will write the decomposed form of S_mm into S_mm.
            # Other KSL diagonalization functions do *not* overwrite S_mm.
            self.libelpa.general_diagonalize(
                H_mm, S_mm, C_mm, eps_M[:self.bd.nbands],
                is_already_decomposed=is_already_decomposed)
        else:
            blockdescriptor.general_diagonalize_dc(H_mm, S_mm.copy(), C_mm,
                                                   eps_M, UL='L')
        self.timer.stop('General diagonalize')

        # Make C_nM compatible with the redistributor
        self.timer.start('Redistribute coefs')
        if outdescriptor:
            C2_nM = C_nM
        else:
            C2_nM = outdescriptor.empty(dtype=dtype)
        assert outdescriptor.check(C2_nM)
        self.mm2nM.redistribute(C_mm, C2_nM, subM, subN)  # blocks2cols
        self.timer.stop('Redistribute coefs')

        self.timer.start('Send coefs to domains')
        # eps_M is already on block_comm.rank = 0; it is easier to broadcast
        # eps_M to all ranks and take the correct slice afterward.
        self.block_comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        self.gd.comm.broadcast(C_nM, 0)
        self.timer.stop('Send coefs to domains')

    def distribute_overlap_matrix(self, S_qmM, root=0,
                                  add_hermitian_conjugate=False):
        # Some MPI implementations need a lot of memory to do large
        # reductions.  To avoid trouble, we do comm.sum on smaller blocks
        # of S (this code is also safe for arrays smaller than blocksize).
        Sflat_x = S_qmM.ravel()
        blocksize = 2**23 // Sflat_x.itemsize  # 8 MiB
        nblocks = -(-len(Sflat_x) // blocksize)
        Mstart = 0
        self.timer.start('blocked summation')
        for i in range(nblocks):
            self.gd.comm.sum(Sflat_x[Mstart:Mstart + blocksize], root=root)
            Mstart += blocksize
        assert Mstart + blocksize >= len(Sflat_x)
        self.timer.stop('blocked summation')
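
        # Illustrative note (added, not from the original source): with
        # 8-byte float64 elements, blocksize = 2**23 // 8 = 1048576 elements
        # per 8 MiB block, so a hypothetical 3000 x 3000 overlap matrix
        # (9e6 elements) is summed in nblocks = ceil(9e6 / 1048576) = 9
        # partial reductions.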

        xshape = S_qmM.shape[:-2]
        if len(xshape) == 0:
            S_qmM = S_qmM[np.newaxis]
        assert S_qmM.ndim == 3

        blockdesc = self.mmdescriptor
        coldesc = self.mM_unique_descriptor
        S_qmm = blockdesc.zeros(len(S_qmM), S_qmM.dtype)

        if not coldesc:  # XXX ugly way to sort out inactive ranks
            S_qmM = coldesc.zeros(len(S_qmM), S_qmM.dtype)

        self.timer.start('Scalapack redistribute')
        for S_mM, S_mm in zip(S_qmM, S_qmm):
            self.mM2mm.redistribute(S_mM, S_mm)
            if add_hermitian_conjugate:
                if blockdesc.active:
                    pblas_tran(1.0, S_mm.copy(), 1.0, S_mm,
                               blockdesc, blockdesc)

            if self.libelpa is not None:
                # Elpa needs the full H_mm, but apparently does not need the
                # full S_mm.  However, that fact is not documented, so we
                # stay on the safe side and tri2full the matrix.
                scalapack_tri2full(blockdesc, S_mm)

        self.timer.stop('Scalapack redistribute')
        return S_qmm.reshape(xshape + blockdesc.shape)
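
    # Illustrative note (added, not from the original source): when
    # add_hermitian_conjugate is set, the pblas_tran() call above adds the
    # transposed block (conjugated for complex data) onto S_mm, i.e.
    # S <- S + S^dagger, mirroring the serial
    # S_qMM += S_qMM.swapaxes(-1, -2).conj() in OrbitalLayouts below.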

    def get_overlap_matrix_shape(self):
        return self.mmdescriptor.shape

    def calculate_blocked_density_matrix(self, f_n, C_nM):
        nbands = self.bd.nbands
        nao = self.nao
        dtype = C_nM.dtype

        self.nMdescriptor.checkassert(C_nM)
        if self.gd.rank == 0:
            Cf_nM = (C_nM * f_n[:, None])
        else:
            C_nM = self.nM_unique_descriptor.zeros(dtype=dtype)
            Cf_nM = self.nM_unique_descriptor.zeros(dtype=dtype)

        r = Redistributor(self.block_comm, self.nM_unique_descriptor,
                          self.mmdescriptor)

        Cf_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(Cf_nM, Cf_mm, nbands, nao)
        del Cf_nM

        C_mm = self.mmdescriptor.zeros(dtype=dtype)
        r.redistribute(C_nM, C_mm, nbands, nao)
        # no use to delete C_nM as it's in the input...

        rho_mm = self.mmdescriptor.zeros(dtype=dtype)

        if 1:  # if self.libelpa is None:
            pblas_simple_gemm(self.mmdescriptor,
                              self.mmdescriptor,
                              self.mmdescriptor,
                              Cf_mm, C_mm, rho_mm, transa='C')
        else:
            # elpa_hermitian_multiply was not faster than the ordinary
            # multiplication in the test.  The way we have things distributed,
            # we need to transpose things at the moment.
            #
            # Rather than enabling this, we should store the coefficients
            # in an appropriate 2D block-cyclic format (c_nm) and not the
            # current C_nM format.  This makes it possible to avoid
            # redistributing the coefficients at all.  But we don't have time
            # to implement this at the moment.
            mul = self.libelpa.hermitian_multiply
            desc = self.mmdescriptor
            from gpaw.utilities.scalapack import pblas_tran

            def T(array):
                tmp = array.copy()
                pblas_tran(alpha=1.0, a_MN=tmp,
                           beta=0.0, c_NM=array,
                           desca=desc, descc=desc)
            T(C_mm)
            T(Cf_mm)
            mul(C_mm, Cf_mm, rho_mm,
                desc, desc, desc,
                uplo_a='X', uplo_c='X')

        return rho_mm

    def calculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        """Calculate density matrix from occupations and coefficients.

        Presently this function performs the usual ScaLAPACK three-step
        trick: redistribute, number-crunch, redistribute back.

        Notes on future performance improvement.

        As per the current framework, C_nM exists as copies on each
        domain, i.e. this is not parallel over domains.  We'd like to
        correct this and have an efficient distribution using e.g. the
        block communicator.

        The diagonalization routine and other parts of the code should
        however be changed to accommodate the following scheme:

        Keep coefficients in C_mm form after the diagonalization.
        rho_mm can then be directly calculated from C_mm without
        redistribution, after which we only need to redistribute
        rho_mm across domains.

        """
        dtype = C_nM.dtype
        rho_mm = self.calculate_blocked_density_matrix(f_n, C_nM)
        rback = Redistributor(self.block_comm, self.mmdescriptor,
                              self.mM_unique_descriptor)
        rho1_mM = self.mM_unique_descriptor.zeros(dtype=dtype)
        rback.redistribute(rho_mm, rho1_mM)
        del rho_mm

        if rho_mM is None:
            if self.gd.rank == 0:
                rho_mM = rho1_mM
            else:
                rho_mM = self.mMdescriptor.zeros(dtype=dtype)

        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def distribute_to_columns(self, rho_mm, srcdescriptor):
        redistributor = Redistributor(self.block_comm,  # XXX
                                      srcdescriptor,
                                      self.mM_unique_descriptor)
        rho_mM = redistributor.redistribute(rho_mm)
        if self.gd.rank != 0:
            rho_mM = self.mMdescriptor.zeros(dtype=rho_mm.dtype)
        self.gd.comm.broadcast(rho_mM, 0)
        return rho_mM

    def oldcalculate_density_matrix(self, f_n, C_nM, rho_mM=None):
        # This version is parallel over the band descriptor only.
        # This is inefficient, but let's keep it for a while in case
        # there's trouble with the more efficient version.
        if rho_mM is None:
            rho_mM = self.mMdescriptor.zeros(dtype=C_nM.dtype)

        Cf_nM = (C_nM * f_n[:, None]).conj()
        pblas_simple_gemm(self.nMdescriptor, self.nMdescriptor,
                          self.mMdescriptor, Cf_nM, C_nM, rho_mM, transa='T')
        return rho_mM

    def get_transposed_density_matrix(self, f_n, C_nM, rho_mM=None):
        return self.calculate_density_matrix(f_n, C_nM, rho_mM).conj()

    def get_description(self):
        (title, template) = BlacsLayouts.get_description(self)
        bg = self.blockgrid
        desc = self.mmdescriptor
        s = template % (bg.nprow, bg.npcol, desc.mb, desc.nb)
        if self.libelpa is not None:
            solver = self.libelpa.description
        else:
            solver = 'ScaLAPACK'
        return ''.join([title, ' / ', solver, ', ', s])


class OrbitalLayouts(KohnShamLayouts):
    def __init__(self, gd, bd, block_comm, dtype, nao,
                 timer=nulltimer):
        KohnShamLayouts.__init__(self, gd, bd, block_comm, dtype,
                                 timer)
        self.mMdescriptor = MatrixDescriptor(nao, nao)
        self.nMdescriptor = MatrixDescriptor(bd.mynbands, nao)

        self.Mstart = 0
        self.Mstop = nao
        self.Mmax = nao
        self.mynao = nao
        self.nao = nao
        self.orbital_comm = bd.comm

    def diagonalize(self, H_MM, C_nM, eps_n, S_MM,
                    overwrite_S=False):
        assert not overwrite_S
        eps_M = np.empty(C_nM.shape[-1])
        self.block_comm.broadcast(H_MM, 0)
        self.block_comm.broadcast(S_MM, 0)
        # The result on different processors is not necessarily bit-wise
        # identical, so only the domain master performs the computation.
        with broadcast_exception(self.gd.comm):
            if self.gd.comm.rank == 0:
                self._diagonalize(H_MM, S_MM.copy(), eps_M)
        self.gd.comm.broadcast(H_MM, 0)
        self.gd.comm.broadcast(eps_M, 0)
        eps_n[:] = eps_M[self.bd.get_slice()]
        C_nM[:] = H_MM[self.bd.get_slice()]

    def _diagonalize(self, H_MM, S_MM, eps_M):
        """Serial diagonalize via LAPACK."""
        # This is replicated computation but ultimately avoids
        # additional communication.
        if eps_M.size == 0:
            return
        eps_M[:], H_MM.T[:] = eigh(H_MM, S_MM,
                                   overwrite_a=True,
                                   check_finite=debug)
        if H_MM.dtype == complex:
            np.negative(H_MM.imag, H_MM.imag)
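
    # Illustrative note (added, not from the original source): eigh() returns
    # the eigenvectors as columns of a matrix v_Mn.  Assigning to H_MM.T
    # stores v^T in H_MM, and negating the imaginary part then turns that
    # into the conjugate transpose, so row n of H_MM holds conj(v_Mn) for
    # state n; diagonalize() above copies those rows into C_nM.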

    def estimate_memory(self, mem, dtype):
        itemsize = mem.itemsize[dtype]
        mem.subnode('eps [M]', self.nao * mem.floatsize)
        mem.subnode('H [MM]', self.nao * self.nao * itemsize)

    def distribute_overlap_matrix(self, S_qMM, root=0,
                                  add_hermitian_conjugate=False):
        self.gd.comm.sum(S_qMM, root)
        if add_hermitian_conjugate:
            S_qMM += S_qMM.swapaxes(-1, -2).conj()
        return S_qMM

    def get_overlap_matrix_shape(self):
        return self.nao, self.nao

    def calculate_density_matrix(self, f_n, C_nM, rho_MM=None, C2_nM=None):
        # Only a madman would use a non-transposed density matrix.
        # Maybe we should use get_transposed_density_matrix instead.
        if rho_MM is None:
            rho_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)
        # XXX Should not conjugate, but call gemm(..., 'c').
        # Although that requires knowing C_Mn and not C_nM,
        # it also conforms better to the usual conventions in the literature.

        # Use only occupied bands to construct the density matrix:
        occupied = abs(f_n) > 1.0e-10
        if C2_nM is None:
            C2_nM = C_nM
        Cf_Mn = \
            np.ascontiguousarray(C2_nM[occupied].T.conj() * f_n[occupied])
        # gemm(1.0, C_nM[occupied], Cf_Mn, 0.0, rho_MM, 'n')
        mmm(1.0, Cf_Mn, 'N', C_nM[occupied], 'N', 0.0, rho_MM)
        return rho_MM
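
    # Illustrative sketch (added, not from the original source): in plain
    # NumPy the mmm() call above computes
    #
    #     rho_MM = (C_nM[occupied].T.conj() * f_n[occupied]) @ C_nM[occupied]
    #
    # i.e. rho_MM[M1, M2] = sum_n f_n conj(C_nM[n, M1]) C_nM[n, M2], with
    # shapes (nao, nocc) @ (nocc, nao) -> (nao, nao).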

    def get_transposed_density_matrix(self, f_n, C_nM, rho_MM=None):
        return self.calculate_density_matrix(f_n, C_nM, rho_MM).T.copy()

        # if rho_MM is None:
        #     rho_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)
        # C_Mn = C_nM.T.copy()
        # gemm(1.0, C_Mn, f_n[np.newaxis, :] * C_Mn, 0.0, rho_MM, 'c')
        # self.bd.comm.sum(rho_MM)
        # return rho_MM

    def alternative_calculate_density_matrix(self, f_n, C_nM, rho_MM=None):
        if rho_MM is None:
            rho_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)
        # Alternative suggestion.  Might be faster.  Someone should test this.
        C_Mn = C_nM.T.copy()
        r2k(0.5, C_Mn, f_n * C_Mn, 0.0, rho_MM)
        tri2full(rho_MM)
        return rho_MM

    def get_description(self):
        return 'Serial LAPACK'

    def calculate_density_matrix_delta(self, d_nn, C_nM, rho_MM=None):
        # Only a madman would use a non-transposed density matrix.
        # Maybe we should use get_transposed_density_matrix instead.
        if rho_MM is None:
            rho_MM = np.zeros((self.mynao, self.nao), dtype=C_nM.dtype)
        Cd_Mn = np.zeros((self.nao, self.bd.mynbands), dtype=C_nM.dtype)
        # XXX Should not conjugate, but call gemm(..., 'c').
        # Although that requires knowing C_Mn and not C_nM,
        # it also conforms better to the usual conventions in the literature.
        C_Mn = C_nM.T.conj().copy()
        # gemm(1.0, d_nn, C_Mn, 0.0, Cd_Mn, 'n')
        # gemm(1.0, C_nM, Cd_Mn, 0.0, rho_MM, 'n')
        mmm(1.0, C_Mn, 'N', d_nn, 'N', 0.0, Cd_Mn)
        mmm(1.0, Cd_Mn, 'N', C_nM, 'N', 0.0, rho_MM)
        self.bd.comm.sum(rho_MM)
        return rho_MM

    def get_transposed_density_matrix_delta(self, d_nn, C_nM, rho_MM=None):
        return self.calculate_density_matrix_delta(d_nn, C_nM, rho_MM).T.copy()