Coverage for gpaw/utilities/urlcheck.py: 27%
41 statements
« prev ^ index » next coverage.py v7.7.1, created at 2025-07-09 00:21 +0000
1"""Check URL's in Python files."""
2import re
3import sys
4from pathlib import Path
5from urllib.error import HTTPError, URLError
6from urllib.request import urlopen, Request
# URL's and URL templates known to be fine — skip checking these.
OK = {
    'https://doi.org/%s',
    'https://arxiv.org/abs/%s',
    'https://gitlab.com/gpaw/gpaw/-/merge_requests/%s',
    'https://gitlab.com/gpaw/gpaw/-/issues/%s',
    'https://xkcd.com/%s',
    'https://gitlab.com/ase/ase.git@master',
    'https://gitlab.com/{name}/{name}.git',
    'https://cmrdb.fysik.dtu.dk/c2db',
    'https://wiki.fysik.dtu.dk/gpaw-files/gpaw-setups-*.tar.gz',
    'https://wiki.fysik.dtu.dk/gpaw-files',
    'https://wiki.fysik.dtu.dk/gpaw-files/',
    'https://wiki.fysik.dtu.dk/gpaw-files/things/',
    'https://gpaw.readthedocs.io/devel',
}

# Pretend to be a browser: some servers reject Python's default user agent.
USERAGENT = 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'
def check(root: Path) -> int:
    """Check URL's in Python files inside root.

    Scans every ``*.py`` file under *root* for ``http(s)://`` URL's,
    skips the known-good ones in ``OK`` and verifies the rest with
    :func:`check1`.

    Parameters
    ----------
    root:
        Directory to scan recursively.

    Returns
    -------
    int
        Number of URL's that failed the check.
    """
    errors = 0
    for path in root.glob('**/*.py'):
        # Generated copies under html/_downloads duplicate sources that
        # are checked in their original location — skip the whole file
        # (the original re-tested this constant condition per URL).
        if 'html/_downloads' in str(path):
            continue
        for n, line in enumerate(path.read_text().splitlines()):
            for url in re.findall(r'https?://\S+', line):
                # Trim punctuation/quotes that trailed the URL in code.
                url = url.rstrip(""",.'"}):""")
                if url in OK:
                    continue
                # Re-balance a closing paren stripped off above
                # (e.g. URL's ending in "...(something)").
                if '(' in url and ')' not in url:
                    url += ')'
                if not check1(path, n, url):
                    errors += 1
    return errors
def check1(path: Path, n: int, url: str) -> bool:
    """Check a single URL; return True if the server responds.

    On an expected network failure the file location, URL and error are
    printed and False is returned; unexpected errors are reported and
    re-raised.

    Parameters
    ----------
    path:
        File the URL was found in (used for error reporting only).
    n:
        Zero-based line number within the file.
    url:
        The URL to check.
    """
    try:
        req = Request(url, headers={'User-Agent': USERAGENT})
        # Close the response (the original leaked the connection) and
        # don't hang forever on an unresponsive server.
        with urlopen(req, timeout=30):
            pass
    except (HTTPError, URLError, ConnectionResetError, TimeoutError) as e:
        print(f'{path}:{n + 1}')
        print(url)
        print(e)
        print()
        return False
    except Exception:
        # Unexpected failure: report the location, then re-raise so the
        # traceback is not lost.
        print(f'{path}:{n + 1}')
        print(url)
        raise
    return True
def test():
    """Fail if too many URL's in gpaw/ and doc/ are broken."""
    total = 0
    for folder in ['gpaw', 'doc']:
        total += check(Path(folder))
    assert total < 10
if __name__ == '__main__':
    # Check every directory given on the command line.
    for arg in sys.argv[1:]:
        check(Path(arg))