82 lines
No EOL
2.9 KiB
Python
Executable file
82 lines
No EOL
2.9 KiB
Python
Executable file
# /// script
|
|
# requires-python = ">=3.12"
|
|
# dependencies = [
|
|
# "rich-click",
|
|
# ]
|
|
# ///
|
|
"""Generate record-depiction-distribution-urls
|
|
"""
|
|
import sys
|
|
import json
|
|
from urllib.parse import urlparse, unquote
|
|
from pathlib import Path
|
|
import rich_click as click
|
|
|
|
# Record classes accepted when no --target-class option is given; compared
# against each record's 'schema_type' field.
RECORD_SPEC = ['xyzri:XYZPerson', 'xyzri:XYZInstrument']

# Short output label -> identifier found in a depiction's 'kind' field.
DEPICTION_SPEC = {
    'logo': 'schema:logo',
    'portrait': 'xyzrins:depiction-types/e9a34f7d-d05e-4591-bb45-f8a0c499e07b',
}

# Reverse lookup: depiction 'kind' identifier -> short output label.
DEPICTION_PIDS = {kind_pid: label for label, kind_pid in DEPICTION_SPEC.items()}

# The known depiction 'kind' identifiers, in DEPICTION_SPEC order; used as the
# default for the --depiction-type option.
DEPICTION_PID_LIST = [*DEPICTION_SPEC.values()]
|
|
|
|
def get_extension(url):
    """
    Return the file extension a URL's path ends with, without the leading dot.

    Only the percent-decoded path component is inspected, so query
    parameters and fragments never contribute. An empty string is returned
    when the final path segment has no suffix.
    """
    decoded_path = unquote(urlparse(url).path)
    suffix = Path(decoded_path).suffix
    return suffix.lstrip('.')
|
|
|
|
def _iter_download_urls(depiction):
    """Yield each usable download URL attached to one depiction.

    Walks ``distributions`` -> ``characterized_by`` and yields the ``object``
    of every entry whose ``predicate`` is ``dcat:downloadUrl``. Missing or
    ``null`` lists are treated as empty; empty or non-string objects are
    skipped because they cannot be parsed as a URL.
    """
    for dist in depiction.get('distributions') or []:
        for char in dist.get('characterized_by') or []:
            if char.get('predicate') != 'dcat:downloadUrl':
                continue
            url = char.get('object')
            if isinstance(url, str) and url:
                yield url


@click.command()
@click.option('--target-class', '-t', multiple=True, default=RECORD_SPEC)
@click.option('--depiction-type', '-d', multiple=True, default=DEPICTION_PID_LIST)
def main(
    target_class: list | tuple = RECORD_SPEC,
    depiction_type: list | tuple = DEPICTION_PID_LIST,
) -> None:
    """
    - Takes lines of json, each containing a metadata record, as input
    - The record is assumed to have a specific structure with regards to inlined
      objects at specific fields, e.g. Person -> depictions -> distributions -> characterized_by
      (this structure can be constructed, for example, with the use of dtc + qri + jq)
    - It will then extract download URLs for each depiction of the record, provided
      the depiction has a 'kind' included in the depiction-type argument
    - For each depiction distribution, it will output: the depiction kind, the record
      curie, the file extension and the url, tab-separated, to stdout
    """
    # NOTE: the docstring above doubles as the --help text, so implementation
    # notes live in comments instead.
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # skip invalid JSON
            continue
        # valid JSON that is not an object (list, string, number, ...) is not a record
        if not isinstance(record, dict):
            continue
        # skip records that aren't a specified target class
        if record.get('schema_type') not in target_class:
            continue
        # record pid is required for further processing
        pid = record.get('pid')
        if not pid or not isinstance(pid, str):
            continue
        # drop the prefix up to the first ':'; a pid without ':' is kept whole
        curie_ref = pid.split(':', 1)[-1]
        # `or []` also covers an explicit JSON null
        for dep in record.get('depictions') or []:
            depiction_pid = dep.get('kind', 'unknown')
            # honour --depiction-type: only the requested kinds are emitted
            if depiction_pid not in depiction_type:
                continue
            # requested kinds without a short label in DEPICTION_SPEC are
            # reported as plain 'depiction'
            depiction_kind = DEPICTION_PIDS.get(depiction_pid, 'depiction')
            for url in _iter_download_urls(dep):
                ext = get_extension(url)
                print(f'{depiction_kind}\t{curie_ref}\t{ext}\t{url}')


if __name__ == '__main__':
    main()