# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "rich-click",
# ]
# ///
"""Extract depiction distribution download URLs from person records.

Reads JSON lines from stdin and, for every matching record, writes one
tab-separated line per download URL to stdout.
"""
import json
import sys
from pathlib import Path
from urllib.parse import unquote, urlparse

import rich_click as click

# The defaults are defined once so that the CLI option defaults and the
# defaults in the signature of ``main`` cannot drift apart.
DEFAULT_TARGET_CLASS = 'xyzri:XYZPerson'
DEFAULT_DEPICTION_TYPE = (
    'xyzrins:depiction-types/e9a34f7d-d05e-4591-bb45-f8a0c499e07b'
)
# Predicate that marks a ``characterized_by`` entry as a download URL.
DOWNLOAD_URL_PREDICATE = 'dcat:downloadUrl'


def get_extension(url):
    """Return the file extension of the path component of *url*.

    Query parameters and fragments are ignored, percent-encoding is decoded,
    and the leading dot is removed. An empty string is returned when the
    path has no suffix.
    """
    path = urlparse(url).path
    return Path(unquote(path)).suffix.lstrip('.')


def is_person(record, target_class):
    """Return True if *record* is a JSON object of class *target_class*.

    Anything that is not a dict (e.g. a JSON array, string or number) is
    never a match, so callers can pass any decoded JSON value.
    """
    return (
        isinstance(record, dict)
        and record.get('schema_type') == target_class
    )


def _dict_items(value):
    """Yield the dict members of *value* if it is a list, else nothing.

    This tolerates fields that are missing, explicitly ``null``, of an
    unexpected type, or that contain non-object members.
    """
    if not isinstance(value, list):
        return
    for item in value:
        if isinstance(item, dict):
            yield item


def _iter_depiction_urls(person, depiction_type):
    """Yield each download URL of *person*'s depictions of *depiction_type*.

    Walks Person -> depictions -> distributions -> characterized_by and
    yields the ``object`` of every entry whose ``predicate`` is the download
    URL predicate. Empty and non-string objects are skipped.
    """
    for depiction in _dict_items(person.get('depictions')):
        if depiction.get('kind') != depiction_type:
            continue
        for distribution in _dict_items(depiction.get('distributions')):
            for statement in _dict_items(
                distribution.get('characterized_by')
            ):
                if statement.get('predicate') != DOWNLOAD_URL_PREDICATE:
                    continue
                url = statement.get('object')
                if url and isinstance(url, str):
                    yield url


@click.command()
@click.option(
    '--target-class', '-t',
    default=DEFAULT_TARGET_CLASS,
    show_default=True,
    help="Only process records whose 'schema_type' equals this value.",
)
@click.option(
    '--depiction-type', '-d',
    default=DEFAULT_DEPICTION_TYPE,
    show_default=True,
    help="Only process depictions whose 'kind' equals this value.",
)
def main(
    target_class: str = DEFAULT_TARGET_CLASS,
    depiction_type: str = DEFAULT_DEPICTION_TYPE,
) -> None:
    """Print download URLs of matching depictions for each person record.

    - Takes lines of JSON on stdin, each containing one Person record.
    - The Person record is assumed to have a specific structure with regard
      to inlined objects at specific fields:
      Person -> depictions -> distributions -> characterized_by
      (this structure can be constructed, for example, with the use of
      dtc + qri + jq).
    - Download URLs are extracted for each depiction of the person record,
      provided the depiction has the 'kind' specified by the depiction-type
      argument.
    - For each depiction distribution, one tab-separated line is written to
      stdout: the reference part of the person pid (the text after the first
      ':'), the file extension, and the URL.

    Blank lines, invalid JSON, records of another class, and records without
    a usable pid are skipped silently.
    """
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            person = json.loads(line)
        except json.JSONDecodeError:
            # Deliberate best-effort: a malformed line must not abort the
            # processing of the remaining input.
            continue

        if not is_person(person, target_class):
            continue

        pid = person.get('pid')
        if not pid or not isinstance(pid, str):
            continue
        # Keep only the reference part of the pid; a pid without ':' is
        # used unchanged.
        curie_ref = pid.split(':', 1)[-1]

        for url in _iter_depiction_urls(person, depiction_type):
            print(f'{curie_ref}\t{get_extension(url)}\t{url}')


if __name__ == '__main__':
    main()