79 lines
No EOL
2.6 KiB
Python
Executable file
79 lines
No EOL
2.6 KiB
Python
Executable file
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "rich-click",
# ]
# ///
|
|
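"""Generate person-depiction-distribution URLs.

Reads person records as JSON lines on stdin and prints one tab-separated
line per download URL found in the matching depictions.
"""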
|
|
import sys
|
|
import json
|
|
from urllib.parse import urlparse, unquote
|
|
from pathlib import Path
|
|
import rich_click as click
|
|
|
|
def get_extension(url):
    """
    Return the file extension of the resource a URL points to.

    Only the path component of the URL is inspected, so query strings and
    fragments never leak into the result. Percent-escapes in the path are
    decoded first. The extension is returned without its leading dot; an
    empty string is returned when the last path segment has no suffix.
    """
    url_path = urlparse(url).path
    decoded_path = unquote(url_path)
    suffix = Path(decoded_path).suffix
    # `suffix` includes the leading dot (e.g. '.jpg'); drop it.
    return suffix.lstrip('.')
|
|
|
|
def is_person(record, target_class):
    """
    Tell whether a decoded JSON record is of the wanted schema type.

    Args:
        record: A value decoded from one JSON line. Only JSON objects
            (dicts) can match; any other value is rejected.
        target_class: The value the record's 'schema_type' field must equal.

    Returns:
        True when `record` is a dict whose 'schema_type' equals
        `target_class`, otherwise False.
    """
    # A JSON line may legally decode to a list, string, number or null;
    # those have no `.get`, so reject them instead of raising.
    if not isinstance(record, dict):
        return False
    return record.get('schema_type') == target_class
|
|
|
|
def _iter_download_urls(person, depiction_type):
    """Yield each download URL found under the person's matching depictions.

    Walks person -> depictions -> distributions -> characterized_by and keeps
    only depictions whose 'kind' equals `depiction_type` and characterizations
    whose 'predicate' is 'dcat:downloadUrl' with a non-empty 'object'.
    """
    # `or []` covers both a missing key and an explicit JSON null, which
    # would otherwise fail when iterated.
    for dep in person.get('depictions') or []:
        if dep.get('kind') != depiction_type:
            continue
        for dist in dep.get('distributions') or []:
            for char in dist.get('characterized_by') or []:
                if char.get('predicate') != 'dcat:downloadUrl':
                    continue
                url = char.get('object')
                if url:
                    yield url


@click.command()
@click.option(
    '--target-class',
    '-t',
    default='xyzri:XYZPerson',
    help="Value of 'schema_type' a record must have to be processed.",
)
@click.option(
    '--depiction-type',
    '-d',
    default='xyzrins:depiction-types/e9a34f7d-d05e-4591-bb45-f8a0c499e07b',
    help="Value of 'kind' a depiction must have for its URLs to be emitted.",
)
def main(
    target_class: str = 'xyzri:XYZPerson',
    depiction_type: str = 'xyzrins:depiction-types/e9a34f7d-d05e-4591-bb45-f8a0c499e07b',
) -> None:
    """
    Print download URLs of person depictions read as JSON lines from stdin.

    - Takes lines of json, each containing a Person record, as input

    - The Person record is assumed to have a specific structure with regard to inlined
      objects at specific fields: Person -> depictions -> distributions -> characterized_by
      (this structure can be constructed, for example, with the use of dtc + qri + jq)

    - It will then extract download URLs for each depiction of the person record, provided
      the depiction has the 'kind' specified by the depiction-type argument

    - For each depiction distribution, it will output to stdout, separated by tabs:
      the reference part of the person's pid (the text after the first ':'),
      the file extension and the url

    Blank lines, lines that are not valid JSON, lines that are not JSON objects,
    records of another schema type and records without a pid are skipped.
    """
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            person = json.loads(line)
        except json.JSONDecodeError:
            # Deliberate best effort: skip lines that are not valid JSON.
            continue

        # Valid JSON that is not an object (list, string, number, null)
        # cannot be a person record; skip it rather than crash on `.get`.
        if not isinstance(person, dict):
            continue
        if not is_person(person, target_class):
            continue

        pid = person.get('pid')
        if not pid:
            continue
        # Keep only what follows the first ':' (the whole pid if it has none).
        curie_ref = pid.split(':', 1)[-1]

        for url in _iter_download_urls(person, depiction_type):
            ext = get_extension(url)
            print(f'{curie_ref}\t{ext}\t{url}')
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()