datalad-handbook/tools/link_consolidator.py

153 lines
4.6 KiB
Python

#!/usr/bin/env python
#
# Use this with the output of the sphinx linkchecker
# $ make linkcheck | tee links.txt
# $ python tools/link_consolidator.py links.txt
#
# It provides a markdown-formatted list of issues, that can be posted
# to an issue tracker.
#
# The following issues are recognized:
#
# - `redundant-trailing-slash`
# The URL has a trailing slash that is (likely) not needed.
# (was: The same URL, but without the trailing slash, is used elsewhere
# in the book).
# - unshorted-youtube-link
# This could be a https://youtu.be/ style short URL
# - needless-trailing-slash
# A URL without a path component. It should need no trailing slash
# - permanent-redirect
# A permanent redirect is reported for a URL
# - none-https
# A link with an insecure protocol
# - needless-latest-handbook
# The default handbook version served is 'en/latest'
# - long-kbi-link
# KBI link with needless '/index.html' tail
from pathlib import Path
import re
import sys
from urllib.parse import urlparse
# Pattern for a single result line of the link checker output. Named groups:
#   use    - text between the opening parenthesis and ': line'
#   line   - the line number (digits only)
#   status - the first space-free token after the closing parenthesis
#   link   - the next space-free token (may carry a trailing newline)
#   note   - whatever follows the link on the same line (possibly empty)
# A raw string is used so that the backslash escapes reach the regex engine
# without triggering Python's invalid-escape-sequence warning.
linkline_re = re.compile(
    r'\((?P<use>.*): line[ ]+(?P<line>[0-9]+)\) (?P<status>[^ ]+) [ ]*(?P<link>[^ ]+)(?P<note>.*)'
)
def read_links(fpath):
    """Parse a link checker log into a per-link record mapping.

    Parameters
    ----------
    fpath
      Object with an ``open()`` method returning an iterable of text lines
      (the script passes a ``pathlib.Path``).

    Returns
    -------
    dict
      Maps each link (whitespace-stripped) to a record with the keys
      ``status`` and ``note`` (taken from the first line mentioning the
      link; ``note`` is ``None`` when empty) and ``uses``, a list of
      ``{'file': ..., 'line': int}`` dicts, one per matching line.

    Lines that do not match ``linkline_re`` are reported on stderr and
    skipped.
    """
    links = {}
    # context manager makes sure the file handle is closed again
    with fpath.open() as fobj:
        for line in fobj:
            m = linkline_re.match(line)
            if m is None:
                print(f'ignore line: {line!r}', file=sys.stderr)
                continue
            fields = m.groupdict()
            link = fields['link'].strip()
            rec = links.get(link)
            if rec is None:
                # first sighting of this link: status/note are recorded
                # once, later occurrences only contribute another use
                rec = links[link] = {
                    'status': fields['status'],
                    'note': fields['note'].strip() or None,
                    'uses': [],
                }
            rec['uses'].append({
                'file': fields['use'],
                'line': int(fields['line']),
            })
    return links
def _report_flaws(report, type_, uses, src):
for u in uses:
flaws = report.get(u['file'], [])
flaws.append(dict(
type=type_,
line=u['line'],
src=src,
))
report[u['file']] = flaws
def report_redundant_trailing_slash(links, report):
    """Record a `redundant-trailing-slash` flaw for each link ending in '/'.

    ``links`` maps a URL to a record with a ``uses`` list; ``report`` is
    extended in place via ``_report_flaws``.
    """
    for url, rec in links.items():
        if not url.endswith('/'):
            continue
        _report_flaws(report, "redundant-trailing-slash", rec['uses'], url)
def report_unshortened_youtube(links, report):
    """Record an `unshorted-youtube-link` flaw for long-form YouTube URLs.

    A link qualifies when it contains the substring 'youtube.com/watch'.
    ``report`` is extended in place via ``_report_flaws``.
    """
    long_form = [url for url in links if 'youtube.com/watch' in url]
    for url in long_form:
        _report_flaws(
            report, "unshorted-youtube-link", links[url]['uses'], url)
def report_domainroot_trailing_slash(links, report):
    """Record a `needless-trailing-slash` flaw for bare domain-root URLs.

    A link qualifies when its parsed path is exactly '/' and it has no
    params, query, or fragment. ``report`` is extended in place via
    ``_report_flaws``.
    """
    for url, rec in links.items():
        parsed = urlparse(url)
        if parsed.path != '/':
            continue
        if parsed.params or parsed.query or parsed.fragment:
            # the slash separates the host from further URL components
            continue
        _report_flaws(report, "needless-trailing-slash", rec['uses'], url)
def report_permanent_redirects(links, report):
    """Record a `permanent-redirect` flaw for permanently redirected links.

    A link qualifies when its record has status 'redirect' and its note
    contains 'permanently to'. The reported source is the link followed
    by the note. ``report`` is extended in place via ``_report_flaws``.
    """
    for link, rec in links.items():
        if rec['status'] != 'redirect':
            continue
        # a record's note is None when the log line carried no note;
        # treat that as "no redirect information" instead of raising
        note = rec['note'] or ''
        if 'permanently to' in note:
            _report_flaws(
                report, "permanent-redirect", rec['uses'],
                f"{link} {note}")
def report_insecure_links(links, report):
    """Record a `none-https` flaw for each link starting with 'http://'.

    ``report`` is extended in place via ``_report_flaws``.
    """
    for url, rec in links.items():
        is_plain_http = url.startswith('http://')
        if is_plain_http:
            _report_flaws(report, "none-https", rec['uses'], url)
def report_needless_latest_handbook(links, report):
    """Record a `needless-latest-handbook` flaw for explicit 'en/latest' links.

    A link qualifies when it contains 'handbook.datalad.org/en/latest/'.
    ``report`` is extended in place via ``_report_flaws``.
    """
    marker = 'handbook.datalad.org/en/latest/'
    for url, rec in links.items():
        if marker not in url:
            continue
        _report_flaws(report, "needless-latest-handbook", rec['uses'], url)
def report_long_kbi_link(links, report):
    """Record a `long-kbi-link` flaw for KBI links ending in 'index.html'.

    A link qualifies when it contains
    'knowledge-base.psychoinformatics.de/kbi' and ends with 'index.html'.
    ``report`` is extended in place via ``_report_flaws``.
    """
    kbi_marker = 'knowledge-base.psychoinformatics.de/kbi'
    for url, rec in links.items():
        if kbi_marker not in url or not url.endswith('index.html'):
            continue
        _report_flaws(report, "long-kbi-link", rec['uses'], url)
def print_report(report):
    """Print the collected flaws to stdout as a nested bullet list.

    ``report`` maps a file name to a list of flaw dicts with the keys
    ``line``, ``type``, and ``src``. Files are printed in sorted order;
    within a file, flaws are ordered by line.
    """
    for docname in sorted(report):
        print(f'- `docs/{docname.strip()}.rst`:')
        # sort a copy so the caller's list order is left untouched
        flaws = list(report[docname])
        flaws.sort(key=lambda flaw: flaw['line'])
        for flaw in flaws:
            line, type_, src = flaw['line'], flaw['type'], flaw['src']
            print(f" - *line {line}*: `{type_}` [{src}]")
if __name__ == '__main__':
    # Script entry point: read the link checker log named on the command
    # line, run every flaw detector over it, and print the combined report.
    if len(sys.argv) < 2:
        # fail with a readable hint rather than an IndexError traceback
        sys.exit(f'usage: {sys.argv[0]} <linkcheck-output-file>')
    links = read_links(Path(sys.argv[1]))
    report = {}
    # the order of the detectors determines the order of flaws that are
    # reported for the same line of the same file
    for reporter in (
            report_redundant_trailing_slash,
            report_unshortened_youtube,
            report_domainroot_trailing_slash,
            report_permanent_redirects,
            report_insecure_links,
            report_needless_latest_handbook,
            report_long_kbi_link,
    ):
        reporter(links, report)
    print_report(report)