# knowledge-enrichment/.forgejo/tools/get-depiction-urls.py

# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "rich-click",
# ]
# ///
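"""Extract depiction download URLs from JSON-lines metadata records.

Reads one JSON record per line from stdin and prints one tab-separated line
per depiction download URL to stdout.
"""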
import sys
import json
from urllib.parse import urlparse, unquote
from pathlib import Path
import rich_click as click

# Record classes ('schema_type' values) accepted by default.
RECORD_SPEC = ['xyzri:XYZPerson', 'xyzri:XYZInstrument']

# Short depiction label -> depiction-type PID.
DEPICTION_SPEC = {
    'logo': 'schema:logo',
    'portrait': 'xyzrins:depiction-types/e9a34f7d-d05e-4591-bb45-f8a0c499e07b',
}

# Reverse lookup: depiction-type PID -> short depiction label.
DEPICTION_PIDS = {pid: label for label, pid in DEPICTION_SPEC.items()}

# All known depiction-type PIDs, in declaration order.
DEPICTION_PID_LIST = [*DEPICTION_SPEC.values()]

def get_extension(url):
    """
    Return the file extension of a URL's path, without the leading dot.

    Only the path component is inspected, so query parameters and fragments
    never contribute. Percent-escapes in the path are decoded before the
    suffix is taken. An empty string is returned when there is no suffix.
    """
    decoded_path = unquote(urlparse(url).path)
    suffix = Path(decoded_path).suffix
    return suffix.lstrip('.')

def _iter_download_urls(depiction):
    """Yield every non-empty 'dcat:downloadUrl' object of a depiction's distributions."""
    # `or []` also covers keys that are present but explicitly null.
    for dist in depiction.get('distributions') or []:
        for char in dist.get('characterized_by') or []:
            if char.get('predicate') != 'dcat:downloadUrl':
                continue
            url = char.get('object')
            if url:
                yield url


@click.command()
@click.option('--target-class', '-t', multiple=True, default=RECORD_SPEC)
@click.option('--depiction-type', '-d', multiple=True, default=DEPICTION_PID_LIST)
def main(
    target_class: list = RECORD_SPEC,
    depiction_type: list = DEPICTION_PID_LIST,
) -> None:
    """
    - Takes lines of json, each containing a metadata record, as input
    - The record is assumed to have a specific structure with regards to inlined
      objects at specific fields, e.g. Person -> depictions -> distributions -> characterized_by
      (this structure can be constructed, for example, with the use of dtc + qri + jq)
    - It will then extract download URLs for each depiction of the record, provided
      the depiction has a 'kind' included in the depiction-type argument
    - For each depiction distribution, it will output a tab-separated line with:
      the depiction kind, the record curie reference, the file extension and
      the url, to stdout
    """
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # skip invalid JSON
            continue
        # valid JSON that is not an object (list, string, number) is not a record
        if not isinstance(record, dict):
            continue
        # skip records that aren't a specified target class
        if record.get('schema_type') not in target_class:
            continue
        # record pid is required for further processing
        pid = record.get('pid')
        if not pid:
            continue
        # keep only the reference part after the curie prefix
        curie_ref = pid.split(':', 1)[-1]
        for dep in record.get('depictions') or []:
            depiction_pid = dep.get('kind', 'unknown')
            # honour --depiction-type: only requested kinds are emitted
            if depiction_pid not in depiction_type:
                continue
            # requested kinds without a known short label default to 'depiction'
            depiction_kind = DEPICTION_PIDS.get(depiction_pid, 'depiction')
            for url in _iter_download_urls(dep):
                ext = get_extension(url)
                print(f'{depiction_kind}\t{curie_ref}\t{ext}\t{url}')

# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()