knowledge-enrichment/.forgejo/tools/get-person-depiction-urls.py

79 lines
No EOL
2.6 KiB
Python
Executable file

# /// script
# requires-python = ">=3.12"
# dependencies = [
# "rich-click",
# ]
# ///
"""Generate person-depiction-distribution-urls
"""
import sys
import json
from urllib.parse import urlparse, unquote
from pathlib import Path
import rich_click as click
def get_extension(url: str) -> str:
    """
    Extract the file extension from a URL.

    Only the path component of the URL is inspected, so query parameters
    and fragments are ignored. Percent-escapes in the path are decoded
    before the extension is determined.

    Args:
        url: The URL to inspect.

    Returns:
        The extension of the last path segment without its leading dot,
        or an empty string when the path has no extension.
    """
    # urlparse splits off "?query" and "#fragment"; only the path remains.
    path = urlparse(url).path
    # Path.suffix covers just the final extension ("a.tar.gz" -> ".gz").
    return Path(unquote(path)).suffix.lstrip('.')
def is_person(record, target_class: str) -> bool:
    """
    Tell whether a decoded JSON record is of the requested class.

    Args:
        record: A value decoded from one JSON line. Anything that is not a
            JSON object (dict) is never considered a match.
        target_class: The value the record's 'schema_type' field must equal.

    Returns:
        True if ``record`` is a dict whose 'schema_type' equals
        ``target_class``, otherwise False.
    """
    # A valid JSON line may decode to a list, number, string or null; those
    # have no .get(), so rule them out instead of raising AttributeError.
    if not isinstance(record, dict):
        return False
    return record.get('schema_type') == target_class
def _iter_download_urls(person, depiction_type):
    """Yield every download URL found under the person's matching depictions.

    Walks person -> depictions -> distributions -> characterized_by and
    yields the 'object' of each entry whose 'predicate' is
    'dcat:downloadUrl', restricted to depictions whose 'kind' equals
    ``depiction_type``. Entries that are not JSON objects, and URLs that
    are empty or not strings, are skipped.
    """
    # ``or []`` covers fields that are present but null in the JSON, which
    # ``dict.get(key, [])`` alone would hand back as None.
    for depiction in person.get('depictions') or []:
        if not isinstance(depiction, dict):
            continue
        if depiction.get('kind') != depiction_type:
            continue
        for distribution in depiction.get('distributions') or []:
            if not isinstance(distribution, dict):
                continue
            for characteristic in distribution.get('characterized_by') or []:
                if not isinstance(characteristic, dict):
                    continue
                if characteristic.get('predicate') != 'dcat:downloadUrl':
                    continue
                url = characteristic.get('object')
                if isinstance(url, str) and url:
                    yield url


@click.command()
@click.option('--target-class', '-t', default='xyzri:XYZPerson')
@click.option('--depiction-type', '-d', default='xyzrins:depiction-types/e9a34f7d-d05e-4591-bb45-f8a0c499e07b')
def main(
    target_class: str = 'xyzri:XYZPerson',
    depiction_type: str = 'xyzrins:depiction-types/e9a34f7d-d05e-4591-bb45-f8a0c499e07b',
) -> None:
    """
    - Takes lines of json, containing a Person record, as input
    - The Person record is assumed to have a specific structure with regards to inlined
      objects at specific fields: Person -> depictions -> distributions -> characterized_by
      (this structure can be constructed, for example, with the use of dtc + qri + jq)
    - It will then extract download URLs for each depiction of the person record, provided
      the depiction has the 'kind' specified by the depiction-type argument
    - For each depiction distribution, it will output: the person curie, the file extension
      and the url to stdout
    """
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        try:
            person = json.loads(line)
        except json.JSONDecodeError:
            # skip invalid JSON
            continue
        # Valid JSON that is not an object (list, number, null, ...) cannot
        # be a Person record; skip it like any other non-matching line.
        if not isinstance(person, dict) or not is_person(person, target_class):
            continue
        pid = person.get('pid')
        if not isinstance(pid, str) or not pid:
            continue
        # Keep only the part after the first ':' (the whole pid if it has none).
        curie_ref = pid.split(':', 1)[-1]
        for url in _iter_download_urls(person, depiction_type):
            print(f'{curie_ref}\t{get_extension(url)}\t{url}')
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()