Coverage for gpaw/utilities/timing.py: 29%

249 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-07-19 00:19 +0000

1# Copyright (C) 2003 CAMP 

2# Please see the accompanying LICENSE file for further information. 

3 

4import sys 

5import time 

6import math 

7 

8import numpy as np 

9from ase.utils.timing import Timer 

10 

11import gpaw.mpi as mpi 

12 

13 

14class NullTimer: 

15 """Compatible with Timer and StepTimer interfaces. Does nothing.""" 

16 def __init__(self): 

17 pass 

18 

19 def print_info(self, calc): 

20 pass 

21 

22 def start(self, name): 

23 pass 

24 

25 def stop(self, name=None): 

26 pass 

27 

28 def get_time(self, name): 

29 return 0.0 

30 

31 def write(self, out=sys.stdout): 

32 pass 

33 

34 def write_now(self, mark=''): 

35 pass 

36 

37 def add(self, timer): 

38 pass 

39 

40 def __call__(self, name): 

41 return self 

42 

43 def __enter__(self): 

44 pass 

45 

46 def __exit__(self, *args): 

47 pass 

48 

49 

50nulltimer = NullTimer() 

51 

52 

53class DebugTimer(Timer): 

54 def __init__(self, print_levels=1000, comm=mpi.world, txt=sys.stdout): 

55 Timer.__init__(self, print_levels) 

56 ndigits = 1 + int(math.log10(comm.size)) 

57 self.srank = '%0*d' % (ndigits, comm.rank) 

58 self.txt = txt 

59 

60 def start(self, name): 

61 Timer.start(self, name) 

62 abstime = time.time() 

63 t = self.timers[tuple(self.running)] + abstime 

64 self.txt.write('T%s >> %15.8f %s (%7.5fs) started\n' 

65 % (self.srank, abstime, name, t)) 

66 

67 def stop(self, name=None): 

68 if name is None: 

69 name = self.running[-1] 

70 abstime = time.time() 

71 t = self.timers[tuple(self.running)] + abstime 

72 self.txt.write('T%s << %15.8f %s (%7.5fs) stopped\n' 

73 % (self.srank, abstime, name, t)) 

74 Timer.stop(self, name) 

75 

76 

77class GPUEvent: 

78 def __init__(self, name): 

79 self.name = name 

80 import cupy 

81 default = dict(block=False, 

82 disable_timing=False, 

83 interprocess=False) 

84 self.stop_event = cupy.cuda.Event(**default) 

85 self.start_event = cupy.cuda.Event(**default) 

86 self.start_event.record() 

87 

88 def stop(self): 

89 self.stop_event.record() 

90 

91 def get_time(self): 

92 import cupy 

93 return cupy.cuda.get_elapsed_time(self.start_event, 

94 self.stop_event) / 1000 

95 

96 

97class GPUTimerBase: 

98 

99 def __init__(self, max_stack=10): 

100 self.event_queue = [] 

101 self.event_stack = [] 

102 self.max_stack = max_stack 

103 from collections import defaultdict 

104 self.gpu_timers = defaultdict(float) 

105 

106 def gpu_start(self, key): 

107 self.event_stack.append(GPUEvent(key)) 

108 

109 def gpu_stop(self): 

110 gpu_event = self.event_stack.pop() 

111 gpu_event.stop() 

112 self.event_queue.append(gpu_event) 

113 if len(self.event_queue) > self.max_stack: 

114 self.handle_events() 

115 

116 def handle_events(self): 

117 while len(self.event_queue): 

118 event = self.event_queue[0] 

119 if not event.stop_event.done: 

120 break 

121 del self.event_queue[0] 

122 time = event.get_time() 

123 self.gpu_timers[event.name] += time 

124 self.handle_event_hook(event) 

125 

126 def handle_event_hook(self, event): 

127 pass 

128 

129 def gpu_write(self, out=sys.stdout): 

130 import cupy 

131 event = cupy.cuda.Event(block=True) 

132 event.synchronize() 

133 self.handle_events() 

134 

135 timers, self.timers = self.timers, self.gpu_timers 

136 Timer.write(self, out) 

137 self.timers = timers 

138 

139 

140class GPUTimer(Timer, GPUTimerBase): 

141 def __init__(self, *args, **kwargs): 

142 Timer.__init__(self, *args, **kwargs) 

143 GPUTimerBase.__init__(self) 

144 

145 def start(self, name): 

146 Timer.start(self, name) 

147 GPUTimerBase.gpu_start(self, name) 

148 

149 def stop(self, name=None): 

150 Timer.stop(self, name) 

151 GPUTimerBase.gpu_stop(self) 

152 

153 def write(self, out=sys.stdout): 

154 print('CPU event timings:', file=out) 

155 Timer.write(self, out) 

156 print('GPU event timings:', file=out) 

157 GPUTimerBase.gpu_write(self, out) 

158 

159 

160def ranktxt(comm, rank=None): 

161 rank = comm.rank if rank is None else rank 

162 ndigits = len(str(comm.size - 1)) 

163 return '%0*d' % (ndigits, rank) 

164 

165 

166class ParallelTimer(DebugTimer): 

167 """Like DebugTimer but writes timings from all ranks. 

168 

169 Each rank writes to timings.<rank>.txt. Also timings.metadata.txt 

170 will contain information about the parallelization layout. The idea 

171 is that the output from this timer can be used for plots and to 

172 determine bottlenecks in the parallelization. 

173 

174 See the tool gpaw-plot-parallel-timings.""" 

175 def __init__(self, prefix='timings', flush=False): 

176 fname = f'{prefix}.{ranktxt(mpi.world)}.txt' 

177 txt = open(fname, 'w', buffering=1 if flush else -1) 

178 DebugTimer.__init__(self, comm=mpi.world, txt=txt) 

179 self.prefix = prefix 

180 

181 def print_info(self, calc): 

182 """Print information about parallelization into a file.""" 

183 fd = open('%s.metadata.txt' % self.prefix, 'w') 

184 DebugTimer.print_info(self, calc) 

185 wfs = calc.wfs 

186 

187 # We won't have to type a lot if everyone just sends all their numbers. 

188 myranks = np.array([wfs.world.rank, wfs.kd.comm.rank, 

189 wfs.bd.comm.rank, wfs.gd.comm.rank]) 

190 allranks = None 

191 if wfs.world.rank == 0: 

192 allranks = np.empty(wfs.world.size * 4, dtype=int) 

193 wfs.world.gather(myranks, 0, allranks) 

194 if wfs.world.rank == 0: 

195 for itsranks in allranks.reshape(-1, 4): 

196 fd.write('r=%d k=%d b=%d d=%d\n' % tuple(itsranks)) 

197 fd.close() 

198 

199 

200class Profiler(Timer): 

201 def __init__(self, prefix, comm=mpi.world): 

202 import atexit 

203 

204 self.prefix = prefix 

205 self.comm = comm 

206 self.ranktxt = ranktxt(comm) 

207 fname = f'{prefix}.{self.ranktxt}.json' 

208 self.txt = open(fname, 'w', buffering=-1) 

209 self.pid = 0 # os.getpid() creates more confusing output 

210 atexit.register(Profiler.finish_trace, self) 

211 # legacy json format for perfetto always assumes microseconds 

212 self.u = 1_000_000 

213 

214 self.synchronize() 

215 Timer.__init__(self, 1000) 

216 

217 def synchronize(self): 

218 # Synchronize in order to have same time reference 

219 ref = np.zeros(1) 

220 if self.comm.rank == 0: 

221 ref[0] = time.time() 

222 self.comm.broadcast(ref, 0) 

223 self.ref = ref[0] 

224 

225 def finish_trace(self): 

226 self.txt.close() 

227 self.comm.barrier() 

228 if self.comm.rank == 0: 

229 out = open(self.prefix + '.json', 'w') 

230 out.write("""{ 

231 "traceEvents": 

232 [ """) 

233 for i in range(self.comm.size): 

234 fname = f'{self.prefix}.{ranktxt(self.comm, rank=i)}.json' 

235 print('Processing', fname) 

236 with open(fname, 'r') as f: 

237 out.writelines(f.readlines()) 

238 out.write("] }\n") 

239 out.close() 

240 self.comm.barrier() 

241 

242 def start(self, name): 

243 Timer.start(self, name) 

244 self.txt.write( 

245 f"""{{"name": "{name}", "cat": "PERF", "ph": "B",""" 

246 f""" "pid": {self.pid}, "tid": {self.ranktxt}, """ 

247 f""""ts": {int((time.time() - self.ref) * self.u)} }},\n""") 

248 

249 def stop(self, name=None): 

250 if name is None: 

251 name = self.running[-1] 

252 self.txt.write( 

253 f"""{{"name": "{name}", "cat": "PERF", "ph": "E", """ 

254 f""""pid": {self.pid}, "tid": {self.ranktxt}, """ 

255 f""""ts": {int((time.time() - self.ref) * self.u)}}},\n""") 

256 Timer.stop(self, name) 

257 

258 

259class GPUProfiler(Profiler, GPUTimerBase): 

260 def __init__(self, prefix, comm=mpi.world): 

261 Profiler.__init__(self, prefix, comm=comm) 

262 GPUTimerBase.__init__(self) 

263 

264 def synchronize(self): 

265 from cupy.cuda import Event 

266 # Make sure GPU gets here 

267 event = Event(block=True) 

268 event.record() 

269 event.synchronize() 

270 

271 # Wait all CPUs 

272 self.comm.barrier() 

273 

274 # Now all GPUs and CPUs are somewhat simultaneous 

275 # So, record the reference event and time 

276 event = Event(block=True) 

277 event.record() 

278 self.ref = time.time() 

279 self.ref_event = event 

280 

281 # Broadcast CPU time reference (possibly problematic) 

282 buf = np.zeros(1) 

283 buf[0] = self.ref 

284 self.comm.broadcast(buf, 0) 

285 self.ref = buf[0] 

286 

287 def start(self, name, gpu=False): 

288 Profiler.start(self, name) 

289 if gpu: 

290 GPUTimerBase.gpu_start(self, name) 

291 

292 def stop(self, name=None, gpu=False): 

293 Profiler.stop(self, name) 

294 if gpu: 

295 GPUTimerBase.gpu_stop(self) 

296 

297 def handle_event_hook(self, event): 

298 import cupy 

299 

300 def get_time(e): 

301 t = cupy.cuda.get_elapsed_time 

302 return t(self.ref_event, e) 

303 

304 ms_start = get_time(event.start_event) 

305 ms_stop = get_time(event.stop_event) 

306 self.txt.write( 

307 f"""{{"name": "{event.name}", "cat": "PERF", "ph": "B",""" 

308 f""" "pid": {self.pid}, "tid": "GPU {self.ranktxt}", """ 

309 f""""ts": {int(ms_start * 1000)} }},\n""") 

310 self.txt.write( 

311 f"""{{"name": "{event.name}", "cat": "PERF", "ph": "E", """ 

312 f""""pid": {self.pid}, "tid": "GPU {self.ranktxt}", """ 

313 f""""ts": {int(ms_stop * 1000)} }},\n""") 

314 

315 

316class HPMTimer(Timer): 

317 """HPMTimer requires installation of the IBM BlueGene/P HPM 

318 middleware interface to the low-level UPC library. This will 

319 most likely only work at ANL's BlueGene/P. Must compile 

320 with GPAW_HPM macro in customize.py. Note that HPM_Init 

321 and HPM_Finalize are called in cgpaw.c and not in the Python 

322 interface. Timer must be called on all ranks in node, otherwise 

323 HPM will hang. Hence, we only call HPM_start/stop on a list 

324 subset of timers.""" 

325 

326 top_level = 'GPAW.calculator' # HPM needs top level timer 

327 compatible = ['Initialization', 'SCF-cycle'] 

328 

329 def __init__(self): 

330 Timer.__init__(self) 

331 from gpaw.cgpaw import hpm_start, hpm_stop 

332 self.hpm_start = hpm_start 

333 self.hpm_stop = hpm_stop 

334 hpm_start(self.top_level) 

335 

336 def start(self, name): 

337 Timer.start(self, name) 

338 if name in self.compatible: 

339 self.hpm_start(name) 

340 

341 def stop(self, name=None): 

342 Timer.stop(self, name) 

343 if name in self.compatible: 

344 self.hpm_stop(name) 

345 

346 def write(self, out=sys.stdout): 

347 Timer.write(self, out) 

348 self.hpm_stop(self.top_level) 

349 

350 

351class CrayPAT_timer(Timer): 

352 """Interface to CrayPAT API. In addition to regular timers, 

353 the corresponding regions are profiled by CrayPAT. The gpaw-python has 

354 to be compiled under CrayPAT. 

355 """ 

356 

357 def __init__(self, print_levels=4): 

358 Timer.__init__(self, print_levels) 

359 from gpaw.cgpaw import craypat_region_begin, craypat_region_end 

360 self.craypat_region_begin = craypat_region_begin 

361 self.craypat_region_end = craypat_region_end 

362 self.regions = {} 

363 self.region_id = 5 # leave room for regions in C 

364 

365 def start(self, name): 

366 Timer.start(self, name) 

367 if name in self.regions: 

368 id = self.regions[name] 

369 else: 

370 id = self.region_id 

371 self.regions[name] = id 

372 self.region_id += 1 

373 self.craypat_region_begin(id, name) 

374 

375 def stop(self, name=None): 

376 Timer.stop(self, name) 

377 id = self.regions[name] 

378 self.craypat_region_end(id)