=== Do not change lines below ===
{
"chain": [],
"cmd": "wget -O code/project.py a70b93a69c/project.py",
"dsid": "cd57f37d-f8f2-4572-8dbe-cbc4862134a4",
"exit": 0,
"extra_inputs": [],
"inputs": [],
"outputs": [],
"pwd": "."
}
^^^ Do not change lines above ^^^
223 lines
7.3 KiB
Python
223 lines
7.3 KiB
Python
"""Process project records into project pages

Needs the associated_with property on a project to be inlined to
correctly assign contributors and sites. Needs the (inverse of)
influenced_by field from the umbrella project to be joined to the
record (as "x_had_influence") to correctly assign roles.

"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
import re
|
|
import textwrap
|
|
|
|
import click
|
|
import yaml
|
|
|
|
# Lookup from an organization PID ("ror:"-prefixed) to the site label that
# ends up in a page's front matter. Organizations not listed here are not
# reported as sites.
SITE_DICT = {
    "ror:04xfq0f34": "aachen",  # RWTH Aachen
    "ror:04cvxnb49": "frankfurt",  # Goethe University Frankfurt
    "ror:038t36y30": "heidelberg",  # Heidelberg University
    "ror:02nv7yv05": "juelich",  # Forschungszentrum Jülich
    "ror:023b0x485": "mainz",  # Johannes Gutenberg University Mainz
    "ror:01hynnt93": "mannheim",  # CIMH Mannheim
    "ror:00fbnyb24": "wuerzburg",  # University of Würzburg
}
|
|
|
|
|
|
def format_title(d: dict) -> str | None:
|
|
"""Return formatted title
|
|
|
|
Combines short name and title if possible. Makes do with what's
|
|
available otherwise.
|
|
|
|
"""
|
|
if "short_name" in d and "title" in d:
|
|
res = f"{d['short_name']}: {d['title']}"
|
|
elif "title" in d:
|
|
res = d["title"]
|
|
elif "short_name" in d:
|
|
res = d["short_name"]
|
|
else:
|
|
res = None
|
|
return res
|
|
|
|
|
|
def get_contributors_and_sites(d: dict) -> tuple[list[str], list[str]]:
    """Go through attributions, return contributor and site labels

    Only contributors with TRR root PID and known sites are returned:

    - an inlined organization associated with the "marcrel:sht" role
      yields a site label, if its PID is a key of SITE_DICT
    - an inlined person (with any role) yields a contributor label,
      taken from the part of the PID after "trr379root:contributors/"

    Associations whose object is not inlined (not a dict), or whose
    inlined object has no string "pid", are skipped silently instead
    of raising.

    Returns a tuple (contributors, sites); both are sorted lists
    without duplicates.

    """
    ctb_pat = re.compile(r"trr379root:contributors/([\w\-]+)")
    contributors = set()
    sites = set()

    for association in d.get("associated_with", []):
        obj = association.get("object")
        if not isinstance(obj, dict):
            # silently demand inlined input so we can check schema type
            continue

        pid = obj.get("pid")
        if not isinstance(pid, str):
            # nothing to look up or match without a usable PID
            continue

        schema_type = obj.get("schema_type")
        if schema_type == "trr379ri:TRR379Organization":
            # roles may be given as plain PIDs or as inlined objects
            is_supporting_host = any(
                (role.get("pid") if isinstance(role, dict) else role) == "marcrel:sht"
                for role in association.get("roles", [])
            )
            # associated_with(org, supporting host) -> site
            if is_supporting_host:
                site_label = SITE_DICT.get(pid)
                if site_label is not None:
                    sites.add(site_label)
        elif schema_type == "trr379ri:TRR379Person":
            # associated_with(person, any) -> contributor
            if (m := ctb_pat.match(pid)) is not None:
                contributors.add(m.group(1))

    return sorted(contributors), sorted(sites)
|
|
|
|
|
|
def get_topics(d: dict) -> list[str]:
    """Return topic labels from the "about" entries of a project

    Each entry may be a plain PID string or an inlined object carrying
    a "pid". A label is the part of the PID that follows
    "trr379root:topics/"; entries with a PID outside that namespace,
    or without a string PID at all, are skipped. The order of the
    record (including any repetitions) is kept.

    """
    topic_pat = re.compile(r"trr379root:topics/([\w\-]+)")
    topics = []
    for x in d.get("about", []):
        x_pid = x.get("pid") if isinstance(x, dict) else x
        if not isinstance(x_pid, str):
            # e.g. inlined object without "pid": re.match would raise on None
            continue
        if (m := topic_pat.match(x_pid)) is not None:
            topics.append(m.group(1))
    return topics
|
|
|
|
|
|
def get_roles(d: dict) -> list[str]:
    """Get roles of the project

    A role of the project is defined by its influence on the TRR
    umbrella project. For this reason, we rely on the association
    class in the umbrella project's "influenced_by" to have been
    processed. We will assume that the result was appended as
    "x_had_influence" property on any given project.

    Roles may be given as plain PID strings or as inlined objects with
    a "pid". Only PIDs starting with "trr379root:roles/" contribute a
    label; entries without a string PID are skipped instead of
    raising. Returns the sorted, de-duplicated labels.

    """
    pat = re.compile(r"trr379root:roles/([\w\-]+)")
    roles = set()
    for influence in d.get("x_had_influence", []):
        # we could check subject of that influence, but we accept any
        for role in influence.get("roles", []):
            role_pid = role.get("pid") if isinstance(role, dict) else role
            if not isinstance(role_pid, str):
                # e.g. inlined role without "pid": re.match would raise on None
                continue
            if (m := pat.match(role_pid)) is not None:
                roles.add(m.group(1))
    return sorted(roles)
|
|
|
|
|
|
def get_weight(d: dict) -> int | None:
|
|
"""Return sorting weight, following existing convention"""
|
|
pat = re.compile(r"trr379root:projects/([a-cq])(\d+)")
|
|
major_weights = {"a": 1000, "b": 2000, "c": 3000, "q": 4000}
|
|
if (m := re.match(pat, d["pid"])) is not None:
|
|
return major_weights[m.group(1)] + (int(m.group(2)) - 1) * 10
|
|
|
|
|
|
def read_markdown_content(p: Path) -> tuple[dict | None, str | None]:
|
|
"""Read content from a markdown page with a yaml front matter"""
|
|
with p.open() as fp:
|
|
lines = fp.readlines()
|
|
fence_loc = [i for i, line in enumerate(lines) if line.rstrip() == "---"]
|
|
|
|
if len(fence_loc) >= 2:
|
|
header_str = "".join(lines[fence_loc[0]+1:fence_loc[1]])
|
|
content = "".join(lines[fence_loc[1]+1:])
|
|
try:
|
|
header = yaml.safe_load(header_str)
|
|
except yaml.YAMLError:
|
|
header = None
|
|
else:
|
|
header = None
|
|
content = None
|
|
|
|
return header, content
|
|
|
|
|
|
def write_page(p: Path, fm: dict, content: str | None, sep=True):
|
|
"""Write a markdown page with yaml front matter"""
|
|
|
|
with p.open("wt") as fp:
|
|
fp.write("---\n")
|
|
yaml.dump(fm, stream=fp, allow_unicode=True, sort_keys=False)
|
|
fp.write("---\n")
|
|
if content is not None:
|
|
if sep:
|
|
fp.write("\n")
|
|
fp.write(content)
|
|
|
|
|
|
@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument(
    "outdir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
def main(input, outdir):
    # Reads one JSON project record per input line and writes, for every
    # record with a projects/ PID, <outdir>/<label>/_index.md and
    # <outdir>/<label>/_index.de.md.
    # (Deliberately a comment: a docstring here would become the --help text.)
    project_pid = re.compile(r"(https://trr379\.de/|trr379root:)projects/([\w\-]+)")

    for raw_record in input:
        project = json.loads(raw_record)

        pid_match = project_pid.match(project["pid"])
        if pid_match is None:
            # only PIDs in a projects/ namespace get a page
            # (excludes root project)
            continue
        label = pid_match.group(2)

        # collect front matter; a key is only set when there is a value for
        # it, and insertion order is the order written to the page
        front_matter = {}
        if (title := format_title(project)) is not None:
            front_matter["title"] = title

        contributors, sites = get_contributors_and_sites(project)
        optional_lists = {
            "contributors": contributors,
            "sites": sites,
            "topics": get_topics(project),
            "roles": get_roles(project),
        }
        front_matter.update(
            {key: labels for key, labels in optional_lists.items() if len(labels) > 0}
        )

        if (weight := get_weight(project)) is not None:
            front_matter["weight"] = weight

        description = project.get("description")

        # prepare for writing markdown page(s)
        page_en = outdir / label / "_index.md"
        page_en.parent.mkdir(exist_ok=True)

        # write default (English) page, description re-wrapped at 80 columns
        body_en = None
        if description is not None:
            body_en = textwrap.fill(description, width=80, break_long_words=False)
        write_page(page_en, front_matter, body_en)

        # for German, keep description and title from the old file, update header
        # (until internationalization is solved in the Pool)
        page_de = page_en.with_suffix(".de.md")
        previous_header, previous_body = (
            read_markdown_content(page_de) if page_de.exists() else (None, None)
        )

        header_de = dict(front_matter)
        if previous_header is not None and "title" in previous_header:
            header_de["title"] = previous_header["title"]
        # old content is written back verbatim, hence no extra separator line
        write_page(page_de, header_de, previous_body, sep=False)
|
|
|
|
|
|
# Run the command line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
|