knowledge-enrichment/.forgejo/tools/get-depiction-urls.py

82 lines
No EOL
2.9 KiB
Python
Executable file

# /// script
# requires-python = ">=3.12"
# dependencies = [
# "rich-click",
# ]
# ///
"""Generate record-depiction-distribution-urls
"""
import sys
import json
from urllib.parse import urlparse, unquote
from pathlib import Path
import rich_click as click
# Default value of the --target-class option; compared against a record's
# 'schema_type' field.
RECORD_SPEC = [
    'xyzri:XYZPerson',
    'xyzri:XYZInstrument',
]

# Short depiction kind name -> depiction type pid.
DEPICTION_SPEC = dict(
    logo='schema:logo',
    portrait='xyzrins:depiction-types/e9a34f7d-d05e-4591-bb45-f8a0c499e07b',
)

# Reverse lookup: depiction type pid -> short depiction kind name.
DEPICTION_PIDS = {pid: name for name, pid in DEPICTION_SPEC.items()}

# Default value of the --depiction-type option (all known depiction type pids).
DEPICTION_PID_LIST = [*DEPICTION_SPEC.values()]
def get_extension(url):
    """
    Return the file extension of the path component of a URL, without the dot.

    Only the path is inspected, so the query string and fragment never
    contribute. Percent-escapes in the path are decoded before the suffix is
    taken. An empty string is returned when the last path segment has no
    suffix.
    """
    decoded_path = unquote(urlparse(url).path)
    suffix = Path(decoded_path).suffix
    return suffix.lstrip('.')
def _iter_download_urls(depiction):
    """Yield each non-empty 'dcat:downloadUrl' object found in *depiction*.

    Walks depiction -> distributions -> characterized_by. A missing or null
    list at either level is treated as empty.
    """
    for dist in depiction.get('distributions') or []:
        for char in dist.get('characterized_by') or []:
            if char.get('predicate') != 'dcat:downloadUrl':
                continue
            url = char.get('object')
            if url:
                yield url


@click.command()
@click.option('--target-class', '-t', multiple=True, default=RECORD_SPEC)
@click.option('--depiction-type', '-d', multiple=True, default=DEPICTION_PID_LIST)
def main(
    target_class: list = RECORD_SPEC,
    depiction_type: list = DEPICTION_PID_LIST,
) -> None:
    """
    - Takes lines of json, each containing a metadata record, as input (stdin)
    - The record is assumed to have a specific structure with regard to inlined
      objects at specific fields, e.g. Person -> depictions -> distributions -> characterized_by
      (this structure can be constructed, for example, with the use of dtc + qri + jq)
    - Blank lines, lines that are not valid json objects, records whose
      'schema_type' is not one of the target-class values, and records without
      a 'pid' are skipped
    - It will then extract download URLs for each depiction of the record, provided
      the depiction has a 'kind' included in the depiction-type argument
    - For each depiction distribution download URL, it will output one
      tab-separated line to stdout: the depiction kind name, the record curie
      reference, the file extension and the url
    """
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # skip invalid JSON
            continue
        # valid JSON that is not an object (e.g. a list or number) has no fields
        if not isinstance(record, dict):
            continue
        # skip records that aren't a specified target class
        if record.get('schema_type') not in target_class:
            continue
        # record pid is required for further processing
        pid = record.get('pid')
        if not pid or not isinstance(pid, str):
            continue
        # drop the prefix: everything up to and including the first ':'
        curie_ref = pid.split(':', 1)[-1]
        # 'or []' also covers an explicit JSON null, which .get(key, []) would not
        for dep in record.get('depictions') or []:
            depiction_pid = dep.get('kind', 'unknown')
            # only depictions of a requested type are reported
            if depiction_pid not in depiction_type:
                continue
            # requested types without a short name are labelled 'depiction'
            depiction_kind = DEPICTION_PIDS.get(depiction_pid, 'depiction')
            for url in _iter_download_urls(dep):
                ext = get_extension(url)
                print(f'{depiction_kind}\t{curie_ref}\t{ext}\t{url}')
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == '__main__':
    main()